Import dav1d_0.7.1.orig.tar.xz
author    Vasyl Gello <vasek.gello@gmail.com>
          Fri, 3 Jul 2020 19:18:02 +0000 (20:18 +0100)
committer Vasyl Gello <vasek.gello@gmail.com>
          Fri, 3 Jul 2020 19:18:02 +0000 (20:18 +0100)
[dgit import orig dav1d_0.7.1.orig.tar.xz]

230 files changed:
CONTRIBUTING.md [new file with mode: 0644]
COPYING [new file with mode: 0644]
NEWS [new file with mode: 0644]
README.md [new file with mode: 0644]
THANKS.md [new file with mode: 0644]
dav1d_logo.png [new file with mode: 0644]
doc/Doxyfile.in [new file with mode: 0644]
doc/PATENTS [new file with mode: 0644]
doc/dav1d_logo.svg [new file with mode: 0644]
doc/meson.build [new file with mode: 0644]
examples/dav1dplay.c [new file with mode: 0644]
examples/dp_fifo.c [new file with mode: 0644]
examples/dp_fifo.h [new file with mode: 0644]
examples/dp_renderer.h [new file with mode: 0644]
examples/dp_renderer_placebo.c [new file with mode: 0644]
examples/dp_renderer_sdl.c [new file with mode: 0644]
examples/meson.build [new file with mode: 0644]
gcovr.cfg [new file with mode: 0644]
include/common/attributes.h [new file with mode: 0644]
include/common/bitdepth.h [new file with mode: 0644]
include/common/dump.h [new file with mode: 0644]
include/common/intops.h [new file with mode: 0644]
include/common/mem.h [new file with mode: 0644]
include/common/validate.h [new file with mode: 0644]
include/compat/gcc/stdatomic.h [new file with mode: 0644]
include/compat/getopt.h [new file with mode: 0644]
include/compat/msvc/stdatomic.h [new file with mode: 0644]
include/dav1d/common.h [new file with mode: 0644]
include/dav1d/data.h [new file with mode: 0644]
include/dav1d/dav1d.h [new file with mode: 0644]
include/dav1d/headers.h [new file with mode: 0644]
include/dav1d/meson.build [new file with mode: 0644]
include/dav1d/picture.h [new file with mode: 0644]
include/dav1d/version.h.in [new file with mode: 0644]
include/meson.build [new file with mode: 0644]
include/vcs_version.h.in [new file with mode: 0644]
meson.build [new file with mode: 0644]
meson_options.txt [new file with mode: 0644]
package/crossfiles/aarch64-android.meson [new file with mode: 0644]
package/crossfiles/arm-android.meson [new file with mode: 0644]
package/crossfiles/i686-linux32.meson [new file with mode: 0644]
package/crossfiles/i686-w64-mingw32.meson [new file with mode: 0644]
package/crossfiles/x86_64-w64-mingw32.meson [new file with mode: 0644]
package/snap/snapcraft.yaml [new file with mode: 0644]
src/arm/32/cdef.S [new file with mode: 0644]
src/arm/32/ipred.S [new file with mode: 0644]
src/arm/32/itx.S [new file with mode: 0644]
src/arm/32/loopfilter.S [new file with mode: 0644]
src/arm/32/looprestoration.S [new file with mode: 0644]
src/arm/32/mc.S [new file with mode: 0644]
src/arm/32/msac.S [new file with mode: 0644]
src/arm/32/util.S [new file with mode: 0644]
src/arm/64/cdef.S [new file with mode: 0644]
src/arm/64/cdef16.S [new file with mode: 0644]
src/arm/64/cdef_tmpl.S [new file with mode: 0644]
src/arm/64/ipred.S [new file with mode: 0644]
src/arm/64/ipred16.S [new file with mode: 0644]
src/arm/64/itx.S [new file with mode: 0644]
src/arm/64/itx16.S [new file with mode: 0644]
src/arm/64/loopfilter.S [new file with mode: 0644]
src/arm/64/loopfilter16.S [new file with mode: 0644]
src/arm/64/looprestoration.S [new file with mode: 0644]
src/arm/64/looprestoration16.S [new file with mode: 0644]
src/arm/64/looprestoration_common.S [new file with mode: 0644]
src/arm/64/looprestoration_tmpl.S [new file with mode: 0644]
src/arm/64/mc.S [new file with mode: 0644]
src/arm/64/mc16.S [new file with mode: 0644]
src/arm/64/msac.S [new file with mode: 0644]
src/arm/64/util.S [new file with mode: 0644]
src/arm/asm.S [new file with mode: 0644]
src/arm/cdef_init_tmpl.c [new file with mode: 0644]
src/arm/cpu.c [new file with mode: 0644]
src/arm/cpu.h [new file with mode: 0644]
src/arm/ipred_init_tmpl.c [new file with mode: 0644]
src/arm/itx_init_tmpl.c [new file with mode: 0644]
src/arm/loopfilter_init_tmpl.c [new file with mode: 0644]
src/arm/looprestoration_init_tmpl.c [new file with mode: 0644]
src/arm/mc_init_tmpl.c [new file with mode: 0644]
src/arm/msac.h [new file with mode: 0644]
src/cdef.h [new file with mode: 0644]
src/cdef_apply.h [new file with mode: 0644]
src/cdef_apply_tmpl.c [new file with mode: 0644]
src/cdef_tmpl.c [new file with mode: 0644]
src/cdf.c [new file with mode: 0644]
src/cdf.h [new file with mode: 0644]
src/cpu.c [new file with mode: 0644]
src/cpu.h [new file with mode: 0644]
src/ctx.h [new file with mode: 0644]
src/data.c [new file with mode: 0644]
src/data.h [new file with mode: 0644]
src/dav1d.rc.in [new file with mode: 0644]
src/decode.c [new file with mode: 0644]
src/decode.h [new file with mode: 0644]
src/dequant_tables.c [new file with mode: 0644]
src/dequant_tables.h [new file with mode: 0644]
src/env.h [new file with mode: 0644]
src/ext/x86/x86inc.asm [new file with mode: 0644]
src/fg_apply.h [new file with mode: 0644]
src/fg_apply_tmpl.c [new file with mode: 0644]
src/film_grain.h [new file with mode: 0644]
src/film_grain_tmpl.c [new file with mode: 0644]
src/getbits.c [new file with mode: 0644]
src/getbits.h [new file with mode: 0644]
src/internal.h [new file with mode: 0644]
src/intra_edge.c [new file with mode: 0644]
src/intra_edge.h [new file with mode: 0644]
src/ipred.h [new file with mode: 0644]
src/ipred_prepare.h [new file with mode: 0644]
src/ipred_prepare_tmpl.c [new file with mode: 0644]
src/ipred_tmpl.c [new file with mode: 0644]
src/itx.h [new file with mode: 0644]
src/itx_1d.c [new file with mode: 0644]
src/itx_1d.h [new file with mode: 0644]
src/itx_tmpl.c [new file with mode: 0644]
src/levels.h [new file with mode: 0644]
src/lf_apply.h [new file with mode: 0644]
src/lf_apply_tmpl.c [new file with mode: 0644]
src/lf_mask.c [new file with mode: 0644]
src/lf_mask.h [new file with mode: 0644]
src/lib.c [new file with mode: 0644]
src/log.c [new file with mode: 0644]
src/log.h [new file with mode: 0644]
src/loopfilter.h [new file with mode: 0644]
src/loopfilter_tmpl.c [new file with mode: 0644]
src/looprestoration.h [new file with mode: 0644]
src/looprestoration_tmpl.c [new file with mode: 0644]
src/lr_apply.h [new file with mode: 0644]
src/lr_apply_tmpl.c [new file with mode: 0644]
src/mc.h [new file with mode: 0644]
src/mc_tmpl.c [new file with mode: 0644]
src/meson.build [new file with mode: 0644]
src/msac.c [new file with mode: 0644]
src/msac.h [new file with mode: 0644]
src/obu.c [new file with mode: 0644]
src/obu.h [new file with mode: 0644]
src/picture.c [new file with mode: 0644]
src/picture.h [new file with mode: 0644]
src/ppc/cdef_init_tmpl.c [new file with mode: 0644]
src/ppc/cpu.c [new file with mode: 0644]
src/ppc/cpu.h [new file with mode: 0644]
src/ppc/looprestoration_init_tmpl.c [new file with mode: 0644]
src/ppc/types.h [new file with mode: 0644]
src/qm.c [new file with mode: 0644]
src/qm.h [new file with mode: 0644]
src/recon.h [new file with mode: 0644]
src/recon_tmpl.c [new file with mode: 0644]
src/ref.c [new file with mode: 0644]
src/ref.h [new file with mode: 0644]
src/refmvs.c [new file with mode: 0644]
src/refmvs.h [new file with mode: 0644]
src/scan.c [new file with mode: 0644]
src/scan.h [new file with mode: 0644]
src/tables.c [new file with mode: 0644]
src/tables.h [new file with mode: 0644]
src/thread.h [new file with mode: 0644]
src/thread_data.h [new file with mode: 0644]
src/thread_task.c [new file with mode: 0644]
src/thread_task.h [new file with mode: 0644]
src/warpmv.c [new file with mode: 0644]
src/warpmv.h [new file with mode: 0644]
src/wedge.c [new file with mode: 0644]
src/wedge.h [new file with mode: 0644]
src/win32/thread.c [new file with mode: 0644]
src/x86/cdef_avx2.asm [new file with mode: 0644]
src/x86/cdef_avx512.asm [new file with mode: 0644]
src/x86/cdef_init_tmpl.c [new file with mode: 0644]
src/x86/cdef_sse.asm [new file with mode: 0644]
src/x86/cpu.c [new file with mode: 0644]
src/x86/cpu.h [new file with mode: 0644]
src/x86/cpuid.asm [new file with mode: 0644]
src/x86/film_grain.asm [new file with mode: 0644]
src/x86/film_grain_init_tmpl.c [new file with mode: 0644]
src/x86/film_grain_ssse3.asm [new file with mode: 0644]
src/x86/ipred.asm [new file with mode: 0644]
src/x86/ipred_init_tmpl.c [new file with mode: 0644]
src/x86/ipred_ssse3.asm [new file with mode: 0644]
src/x86/itx.asm [new file with mode: 0644]
src/x86/itx_init_tmpl.c [new file with mode: 0644]
src/x86/itx_ssse3.asm [new file with mode: 0644]
src/x86/loopfilter.asm [new file with mode: 0644]
src/x86/loopfilter_init_tmpl.c [new file with mode: 0644]
src/x86/loopfilter_ssse3.asm [new file with mode: 0644]
src/x86/looprestoration.asm [new file with mode: 0644]
src/x86/looprestoration_init_tmpl.c [new file with mode: 0644]
src/x86/looprestoration_ssse3.asm [new file with mode: 0644]
src/x86/mc.asm [new file with mode: 0644]
src/x86/mc_init_tmpl.c [new file with mode: 0644]
src/x86/mc_sse.asm [new file with mode: 0644]
src/x86/msac.asm [new file with mode: 0644]
src/x86/msac.h [new file with mode: 0644]
src/x86/msac_init.c [new file with mode: 0644]
tests/checkasm/arm/checkasm_32.S [new file with mode: 0644]
tests/checkasm/arm/checkasm_64.S [new file with mode: 0644]
tests/checkasm/cdef.c [new file with mode: 0644]
tests/checkasm/checkasm.c [new file with mode: 0644]
tests/checkasm/checkasm.h [new file with mode: 0644]
tests/checkasm/filmgrain.c [new file with mode: 0644]
tests/checkasm/ipred.c [new file with mode: 0644]
tests/checkasm/itx.c [new file with mode: 0644]
tests/checkasm/loopfilter.c [new file with mode: 0644]
tests/checkasm/looprestoration.c [new file with mode: 0644]
tests/checkasm/mc.c [new file with mode: 0644]
tests/checkasm/msac.c [new file with mode: 0644]
tests/checkasm/x86/checkasm.asm [new file with mode: 0644]
tests/libfuzzer/alloc_fail.c [new file with mode: 0644]
tests/libfuzzer/alloc_fail.h [new file with mode: 0644]
tests/libfuzzer/dav1d_fuzzer.c [new file with mode: 0644]
tests/libfuzzer/dav1d_fuzzer.h [new file with mode: 0644]
tests/libfuzzer/main.c [new file with mode: 0644]
tests/libfuzzer/meson.build [new file with mode: 0644]
tests/meson.build [new file with mode: 0644]
tools/compat/getopt.c [new file with mode: 0644]
tools/dav1d.c [new file with mode: 0644]
tools/dav1d_cli_parse.c [new file with mode: 0644]
tools/dav1d_cli_parse.h [new file with mode: 0644]
tools/input/annexb.c [new file with mode: 0644]
tools/input/demuxer.h [new file with mode: 0644]
tools/input/input.c [new file with mode: 0644]
tools/input/input.h [new file with mode: 0644]
tools/input/ivf.c [new file with mode: 0644]
tools/input/parse.h [new file with mode: 0644]
tools/input/section5.c [new file with mode: 0644]
tools/meson.build [new file with mode: 0644]
tools/output/md5.c [new file with mode: 0644]
tools/output/muxer.h [new file with mode: 0644]
tools/output/null.c [new file with mode: 0644]
tools/output/output.c [new file with mode: 0644]
tools/output/output.h [new file with mode: 0644]
tools/output/y4m2.c [new file with mode: 0644]
tools/output/yuv.c [new file with mode: 0644]

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644 (file)
index 0000000..347741f
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,56 @@
+# dav1d contribution guide
+
+## CoC
+The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies fully to this project.
+
+## ToDo
+
+The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1d/wikis/task-list).
+
+## Codebase language
+
+The codebase is developed with the following assumptions:
+
+For the library:
+- C language, C99 version, without the VLA or Complex (*\_\_STDC_NO_COMPLEX\_\_*) features, and without compiler extensions,
+- x86 asm in .asm files, using the NASM syntax,
+- arm/arm64 asm in .S files, using the GAS syntax limited to the subset that LLVM 5.0's internal assembler supports,
+- no C++ is allowed, whatever the version.
+
+For the tools and utils:
+- C *(see above for restrictions)*
+- Rust
+- C++ is only allowed for the MFT.
+
+If you want to use *Threads* or *Atomic* features, please conform to the **C11**/**POSIX** semantics and use a wrapper for older compilers/platforms *(as done in VLC)*.
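+
+A minimal sketch of such a wrapper, assuming only `int`-sized atomics are needed *(the tree already ships real shims as include/compat/gcc/stdatomic.h and include/compat/msvc/stdatomic.h)*:
+
+```c
+#if !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>  /* real C11 atomics */
+#else
+/* Hypothetical fallback; only valid on targets where plain int
+ * loads and stores are already atomic. */
+typedef volatile int atomic_int;
+#define atomic_init(p, v)  (*(p) = (v))
+#define atomic_load(p)     (*(p))
+#define atomic_store(p, v) (*(p) = (v))
+#endif
+```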
+
+Please use modern standard POSIX functions *(strscpy, asprintf, tdestroy)*, and provide a compatibility fallback *(as done in VLC)*.
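+
+A hedged sketch of such a fallback for `asprintf()` *(`compat_asprintf` is a hypothetical name used for illustration)*:
+
+```c
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Hypothetical fallback for platforms lacking asprintf():
+ * measure with vsnprintf(), then allocate and format. */
+static int compat_asprintf(char **strp, const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    const int len = vsnprintf(NULL, 0, fmt, ap);
+    va_end(ap);
+    if (len < 0 || !(*strp = malloc(len + 1))) return -1;
+    va_start(ap, fmt);
+    vsnprintf(*strp, len + 1, fmt, ap);
+    va_end(ap);
+    return len;
+}
+```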
+
+We will make reasonable efforts for compilers that are a bit older, but we won't support gcc 3 or MSVC 2012.
+
+## Authorship
+
+Please provide correct authorship in your commit logs, with a name and a valid email address.
+
+We will reject anonymous contributions for now. As an exception, known pseudonyms from the multimedia community are accepted.
+
+This project respects **Copyright** and **Droit d'auteur**. There is no copyright attribution or CLA.
+
+## Commit logs
+
+Please read [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/).
+
+## Submit requests (WIP)
+
+- Code,
+- [Compile](https://xkcd.com/303/),
+- Check your [code style](https://code.videolan.org/videolan/dav1d/wikis/Coding-style),
+- Test,
+- Try,
+- Submit patches through merge requests,
+- Check that this passes the CI.
+
+## Patent license
+
+You need to read, understand, and agree to the [AV1 patents license](doc/PATENTS) before committing.
+
diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..875b138
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,23 @@
+Copyright © 2018-2019, VideoLAN and dav1d authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/NEWS b/NEWS
new file mode 100644 (file)
index 0000000..1294dc5
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,176 @@
+Changes for 0.7.1 'Frigatebird':
+------------------------------
+
+0.7.1 is a minor update on 0.7.0:
+ - ARM32 NEON optimizations for itxfm (up to 28% speedup) and for MSAC
+ - SSE2 optimizations for prep_bilin and prep_8tap
+ - AVX2 optimizations for MC scaled
+ - Fix a clamping issue in motion vector projection
+ - Fix an issue on some specific Haswell CPU on ipred_z AVX2 functions
+ - Improvements on the dav1dplay utility player to support resizing
+
+
+Changes for 0.7.0 'Frigatebird':
+------------------------------
+
+0.7.0 is a major release for dav1d:
+ - Faster refmv implementation, gaining up to 12% speed while using 25% less RAM (single thread)
+ - 10b/12b ARM64 optimizations are mostly complete:
+   - ipred (paeth, smooth, dc, pal, filter, cfl)
+   - itxfm (only 10b)
+ - AVX2/SSSE3 for non-4:2:0 film grain and for mc.resize
+ - AVX2 for cfl4:4:4
+ - AVX-512 CDEF filter
+ - ARM64 8b improvements for cfl_ac and itxfm
+ - ARM64 implementation for emu_edge in 8b/10b/12b
+ - ARM32 implementation for emu_edge in 8b
+ - Improvements on the dav1dplay utility player to support 10 bit,
+   non-4:2:0 pixel formats and film grain on the GPU
+
+
+Changes for 0.6.0 'Gyrfalcon':
+------------------------------
+
+0.6.0 is a major release for dav1d:
+ - New ARM64 optimizations for the 10/12bit depth:
+    - mc_avg, mc_w_avg, mc_mask
+    - mc_put/mc_prep 8tap/bilin
+    - mc_warp_8x8
+    - mc_w_mask
+    - mc_blend
+    - wiener
+    - SGR
+    - loopfilter
+    - cdef
+ - New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
+ - New SSSE3 optimizations for film grain
+ - New AVX2 optimizations for msac_adapt16
+ - Fix rare mismatches against the reference decoder, notably because of clipping
+ - Improvements on ARM64 on msac, cdef and looprestoration optimizations
+ - Improvements on AVX2 optimizations for cdef_filter
+ - Improvements in the C version for itxfm, cdef_filter
+
+
+Changes for 0.5.2 'Asiatic Cheetah':
+------------------------------------
+
+0.5.2 is a small release improving speed for ARM32 and adding minor features:
+ - ARM32 optimizations for loopfilter, ipred_dc|h|v
+ - Add section-5 raw OBU demuxer
+ - Improve the speed by reducing the L2 cache collisions
+ - Fix minor issues
+
+
+Changes for 0.5.1 'Asiatic Cheetah':
+------------------------------------
+
+0.5.1 is a small release improving speeds and fixing minor issues
+compared to 0.5.0:
+ - SSE2 optimizations for CDEF, wiener and warp_affine
+ - NEON optimizations for SGR on ARM32
+ - Fix mismatch issue in x86 asm in inverse identity transforms
+ - Fix build issue in ARM64 assembly if debug info was enabled
+ - Add a workaround for Xcode 11 -fstack-check bug
+
+
+Changes for 0.5.0 'Asiatic Cheetah':
+------------------------------------
+
+0.5.0 is a medium release fixing regressions and minor issues,
+and improving speed significantly:
+ - Export ITU T.35 metadata
+ - Speed improvements on blend_ on ARM
+ - Speed improvements on decode_coef and MSAC
+ - NEON optimizations for blend*, w_mask_, ipred functions for ARM64
+ - NEON optimizations for CDEF and warp on ARM32
+ - SSE2 optimizations for MSAC hi_tok decoding
+ - SSSE3 optimizations for deblocking loopfilters and warp_affine
+ - AVX2 optimizations for film grain and ipred_z2
+ - SSE4 optimizations for warp_affine
+ - VSX optimizations for wiener
+ - Fix inverse transform overflows in x86 and NEON asm
+ - Fix integer overflows with large frames
+ - Improve film grain generation to match reference code
+ - Improve compatibility with older binutils for ARM
+ - More advanced Player example in tools
+
+
+Changes for 0.4.0 'Cheetah':
+----------------------------
+
+ - Fix playback with unknown OBUs
+ - Add an option to limit the maximum frame size
+ - SSE2 and ARM64 optimizations for MSAC
+ - Improve speed on 32-bit systems
+ - Optimization in obmc blend
+ - Reduce RAM usage significantly
+ - The initial PPC SIMD code, cdef_filter
+ - NEON optimizations for blend functions on ARM
+ - NEON optimizations for w_mask functions on ARM
+ - NEON optimizations for inverse transforms on ARM64
+ - VSX optimizations for CDEF filter
+ - Improve handling of malloc failures
+ - Simple Player example in tools
+
+
+Changes for 0.3.1 'Sailfish':
+------------------------------
+
+ - Fix a buffer overflow in frame-threading mode on SSSE3 CPUs
+ - Reduce binary size, notably on Windows
+ - SSSE3 optimizations for ipred_filter
+ - ARM optimizations for MSAC
+
+
+Changes for 0.3.0 'Sailfish':
+------------------------------
+
+This is the final release for the numerous speed improvements of 0.3.0-rc.
+It mostly:
+ - Fixes an annoying crash on SSSE3 that happened in the itx functions
+
+
+Changes for 0.2.2 (0.3.0-rc) 'Antelope':
+-----------------------------
+
+ - Large improvement on MSAC decoding with SSE, bringing a 4-6% speed increase.
+   The impact is significant on SSSE3, SSE4 and AVX2 CPUs
+ - SSSE3 optimizations for all block sizes in itx
+ - SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
+ - Speed improvements on CDEF for SSE4 CPUs
+ - NEON optimizations for SGR and loop filter
+ - Minor crashes, improvements and build changes
+
+
+Changes for 0.2.1 'Antelope':
+----------------------------
+
+ - SSSE3 optimization for cdef_dir
+ - AVX2 improvements of the existing CDEF optimizations
+ - NEON improvements of the existing CDEF and wiener optimizations
+ - Clarification about the numbering/versioning scheme
+
+
+Changes for 0.2.0 'Antelope':
+----------------------------
+
+ - ARM64 and ARM optimizations using NEON instructions
+ - SSSE3 optimizations for both 32-bit and 64-bit
+ - More AVX2 assembly, reaching almost completion
+ - Fix installation of includes
+ - Rewrite inverse transforms to avoid overflows
+ - Snap packaging for Linux
+ - Updated API (ABI and API break)
+ - Fixes for un-decodable samples
+
+
+Changes for 0.1.0 'Gazelle':
+----------------------------
+
+Initial release of dav1d, the fast and small AV1 decoder.
+ - Support for all features of the AV1 bitstream
+ - Support for all bit depths: 8, 10 and 12 bits
+ - Support for all chroma subsamplings: 4:2:0, 4:2:2, 4:4:4 *and* grayscale
+ - Full acceleration for 64-bit AVX2 processors, making it the fastest decoder
+ - Partial acceleration for SSSE3 processors
+ - Partial acceleration for NEON processors
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..edccb2f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,144 @@
+![dav1d logo](dav1d_logo.png)
+
+# dav1d
+
+**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
+
+The canonical repository URL is https://code.videolan.org/videolan/dav1d
+
+This project is partially funded by the *Alliance for Open Media*/**AOM**.
+
+## Goal and Features
+
+The goal of this project is to provide a decoder for **most platforms**, and achieve the **highest speed** possible to overcome the temporary lack of AV1 hardware decoders.
+
+It supports all features from AV1, including all subsampling and bit-depth parameters.
+
+In the future, this project will host simple tools or simple wrappings *(for example, an MFT transform)*.
+
+## License
+
+**dav1d** is released under a very liberal license, unlike the other VideoLAN projects, so that it can be embedded anywhere, including in non-open-source software, or even in drivers, to allow the creation of hybrid decoders.
+
+The reasoning behind this decision is the same as for libvorbis, see [RMS on vorbis](https://lwn.net/2001/0301/a/rms-ov-license.php3).
+
+# Roadmap
+
+The plan is the following:
+
+### Reached
+1. Complete C implementation of the decoder,
+2. Provide a usable API,
+3. Port to most platforms,
+4. Make it fast on desktop, by writing asm for AVX2 chips,
+5. Make it fast on mobile, by writing asm for ARMv8 chips,
+6. Make it fast on older desktops, by writing asm for SSSE3+ chips.
+
+### On-going
+7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
+8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+9. Accelerate less common architectures and instruction sets, like PPC, SSE2 or AVX-512.
+
+### After
+10. Use more GPU, when possible.
+
+# Contribute
+
+Currently, we are looking for help from:
+- C developers,
+- asm developers,
+- platform-specific developers,
+- GPGPU developers,
+- testers.
+
+Our contribution guidelines are quite strict. We want to build a coherent codebase to simplify maintenance and achieve the highest possible speed.
+
+Notably, the codebase is in pure C and asm.
+
+We are on IRC, on the **#dav1d** channel on *Freenode*.
+
+See the [contributions document](CONTRIBUTING.md).
+
+## CLA
+
+There is no CLA.
+
+People will keep their copyright and their authorship rights, while adhering to the BSD 2-clause license.
+
+VideoLAN will only have the collective work rights.
+
+## CoC
+
+The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this project.
+
+# Compile
+
+1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
+2. Run `mkdir build && cd build` to create a build directory and enter it
+3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
+4. Run `ninja` to compile
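+
+Put together, the steps above look like this:
+
+```
+mkdir build && cd build
+meson ..
+ninja
+```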
+
+## Cross-Compilation for 32- or 64-bit Windows, 32-bit Linux
+
+If you're on a Linux build machine and want to compile a .exe for a Windows target machine, run
+
+```
+meson build --cross-file=package/crossfiles/x86_64-w64-mingw32.meson
+```
+
+or, for 32-bit:
+
+```
+meson build --cross-file=package/crossfiles/i686-w64-mingw32.meson
+```
+
+`mingw-w64` is a prerequisite and should be installed on your Linux machine via your preferred method or package manager. Note that binary name formats may differ between distributions; verify the names, and use `alias` if certain binaries cannot be found.
+
+For 32-bit Linux, run
+
+```
+meson build --cross-file=package/crossfiles/i686-linux32.meson
+```
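+
+For reference, a meson cross file is a small INI-style description of the target toolchain. A rough, hypothetical sketch for the mingw64 case *(the real files under package/crossfiles/ are authoritative)*:
+
+```
+[binaries]
+c = 'x86_64-w64-mingw32-gcc'
+ar = 'x86_64-w64-mingw32-ar'
+strip = 'x86_64-w64-mingw32-strip'
+windres = 'x86_64-w64-mingw32-windres'
+
+[host_machine]
+system = 'windows'
+cpu_family = 'x86_64'
+cpu = 'x86_64'
+endian = 'little'
+```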
+
+# Run tests
+
+1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository
+2. During meson configuration, specify `-Dtestdata_tests=true`
+3. Run `meson test -v` after compiling
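+
+Put together, assuming a build directory named `build`:
+
+```
+git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
+meson build -Dtestdata_tests=true
+ninja -C build
+cd build && meson test -v
+```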
+
+# Support
+
+This project is partially funded by the *Alliance for Open Media*/**AOM** and is supported by Two Orioles and VideoLabs.
+
+These companies can provide support and integration help, should you need it.
+
+
+# FAQ
+
+## Why do you not improve libaom rather than starting a new project?
+
+- We believe that libaom is a very good library. However, it was developed for research purposes during the design of AV1.
+We think that an implementation written from scratch can achieve faster decoding, in the same way that *ffvp9* was faster than *libvpx*.
+
+## Is dav1d a recursive acronym?
+
+- Yes.
+
+## Can I help?
+
+- Yes. See the [contributions document](CONTRIBUTING.md).
+
+## I am not a developer. Can I help?
+
+- Yes. We need testers, bug reporters, and documentation writers.
+
+## What about the AV1 patent license?
+
+- This project is an implementation of a decoder. It gives you no special rights on the AV1 patents.
+
+Please read the [AV1 patent license](doc/PATENTS) that applies to the AV1 specification and codec.
+
+## Will you care about <my_arch>? <my_os>?
+
+- We do, but we don't always have the time or the knowledge. Therefore, patches and contributions are welcome.
+
diff --git a/THANKS.md b/THANKS.md
new file mode 100644 (file)
index 0000000..98f8094
--- /dev/null
+++ b/THANKS.md
@@ -0,0 +1,28 @@
+# The dav1d project and VideoLAN association would like to thank
+
+## AOM
+The Alliance for Open Media (AOM) for funding this project.
+
+## Companies
+* Two Orioles LLC, for important coding effort
+* VideoLabs SAS
+
+## Projects
+* VideoLAN
+* FFmpeg
+* libplacebo
+
+## Individual
+
+And all the dav1d Authors (git shortlog -sn), including:
+
+Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer,
+Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet,
+David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge,
+Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang,
+Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring,
+Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede,
+Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent,
+Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen,
+Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier,
+Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth.
diff --git a/dav1d_logo.png b/dav1d_logo.png
new file mode 100644 (file)
index 0000000..2d00855
Binary files /dev/null and b/dav1d_logo.png differ
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
new file mode 100644 (file)
index 0000000..ffb374c
--- /dev/null
+++ b/doc/Doxyfile.in
@@ -0,0 +1,19 @@
+PROJECT_NAME            = dav1d
+OUTPUT_DIRECTORY        = @DOXYGEN_OUTPUT@
+STRIP_FROM_PATH         = @DOXYGEN_STRIP@
+OUTPUT_LANGUAGE         = English
+TAB_SIZE                = 4
+EXTRACT_ALL             = YES
+OPTIMIZE_OUTPUT_FOR_C   = YES
+DOXYFILE_ENCODING       = UTF-8
+TYPEDEF_HIDES_STRUCT    = YES
+
+QUIET                   = YES
+WARNINGS                = YES
+WARN_IF_UNDOCUMENTED    = YES
+
+INPUT                   = @DOXYGEN_INPUT@
+FILE_PATTERNS           = *.h
+
+GENERATE_HTML           = YES
+GENERATE_LATEX          = NO
diff --git a/doc/PATENTS b/doc/PATENTS
new file mode 100644 (file)
index 0000000..d57102a
--- /dev/null
+++ b/doc/PATENTS
@@ -0,0 +1,108 @@
+Alliance for Open Media Patent License 1.0
+
+1. License Terms.
+
+1.1. Patent License. Subject to the terms and conditions of this License, each
+     Licensor, on behalf of itself and successors in interest and assigns,
+     grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+     no-charge, royalty-free, irrevocable (except as expressly stated in this
+     License) patent license to its Necessary Claims to make, use, sell, offer
+     for sale, import or distribute any Implementation.
+
+1.2. Conditions.
+
+1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
+       sell, offer for sale, import or distribute an Implementation under
+       Section 1.1, Licensee must make its Necessary Claims available under
+       this License, and must reproduce this License with any Implementation
+       as follows:
+
+       a. For distribution in source code, by including this License in the
+          root directory of the source code with its Implementation.
+
+       b. For distribution in any other form (including binary, object form,
+          and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+          GDSII, etc.)), by including this License in the documentation, legal
+          notices, and/or other written materials provided with the
+          Implementation.
+
+1.2.2. Additional Conditions. This license is directly from Licensor to
+       Licensee.  Licensee acknowledges as a condition of benefiting from it
+       that no rights from Licensor are received from suppliers, distributors,
+       or otherwise in connection with this License.
+
+1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
+     initiates patent litigation or files, maintains, or voluntarily
+     participates in a lawsuit against another entity or any person asserting
+     that any Implementation infringes Necessary Claims, any patent licenses
+     granted under this License directly to the Licensee are immediately
+     terminated as of the date of the initiation of action unless 1) that suit
+     was in response to a corresponding suit regarding an Implementation first
+     brought against an initiating entity, or 2) that suit was brought to
+     enforce the terms of this License (including intervention in a third-party
+     action by a Licensee).
+
+1.4. Disclaimers. The Reference Implementation and Specification are provided
+     "AS IS" and without warranty. The entire risk as to implementing or
+     otherwise using the Reference Implementation or Specification is assumed
+     by the implementer and user. Licensor expressly disclaims any warranties
+     (express, implied, or otherwise), including implied warranties of
+     merchantability, non-infringement, fitness for a particular purpose, or
+     title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+     ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+     INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+     ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+     OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+     NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. Definitions.
+
+2.1. Affiliate.  “Affiliate” means an entity that directly or indirectly
+     Controls, is Controlled by, or is under common Control of that party.
+
+2.2. Control. “Control” means direct or indirect control of more than 50% of
+     the voting power to elect directors of that corporation, or for any other
+     entity, the power to direct management of such entity.
+
+2.3. Decoder.  "Decoder" means any decoder that conforms fully with all
+     non-optional portions of the Specification.
+
+2.4. Encoder.  "Encoder" means any encoder that produces a bitstream that can
+     be decoded by a Decoder only to the extent it produces such a bitstream.
+
+2.5. Final Deliverable.  “Final Deliverable” means the final version of a
+     deliverable approved by the Alliance for Open Media as a Final
+     Deliverable.
+
+2.6. Implementation.  "Implementation" means any implementation, including the
+     Reference Implementation, that is an Encoder and/or a Decoder. An
+     Implementation also includes components of an Implementation only to the
+     extent they are used as part of an Implementation.
+
+2.7. License. “License” means this license.
+
+2.8. Licensee. “Licensee” means any person or entity who exercises patent
+     rights granted under this License.
+
+2.9. Licensor.  "Licensor" means (i) any Licensee that makes, sells, offers
+     for sale, imports or distributes any Implementation, or (ii) a person
+     or entity that has a licensing obligation to the Implementation as a
+     result of its membership and/or participation in the Alliance for Open
+     Media working group that developed the Specification.
+
+2.10. Necessary Claims.  "Necessary Claims" means all claims of patents or
+      patent applications, (a) that currently or at any time in the future,
+      are owned or controlled by the Licensor, and (b) (i) would be an
+      Essential Claim as defined by the W3C Policy as of February 5, 2004
+      (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+      as if the Specification was a W3C Recommendation; or (ii) are infringed
+      by the Reference Implementation.
+
+2.11. Reference Implementation. “Reference Implementation” means an Encoder
+      and/or Decoder released by the Alliance for Open Media as a Final
+      Deliverable.
+
+2.12. Specification. “Specification” means the specification designated by
+      the Alliance for Open Media as a Final Deliverable for which this
+      License was issued.
+
diff --git a/doc/dav1d_logo.svg b/doc/dav1d_logo.svg
new file mode 100644 (file)
index 0000000..2795db8
--- /dev/null
+++ b/doc/dav1d_logo.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 892.555 241.469"><defs><style>.a,.b,.c,.d,.e,.f,.g{fill-rule:evenodd;}.a{fill:url(#a);}.b{fill:url(#b);}.c{fill:url(#c);}.d{fill:url(#d);}.e{fill:url(#e);}.f{fill:url(#f);}.g{fill:#ec7f38;}.h{fill:#e9800b;}.i{fill:#1c1c1e;}</style><linearGradient id="a" x1="-4.141" y1="797.831" x2="-3.832" y2="797.831" gradientTransform="matrix(0, 93.772, 93.772, 0, -74468.833, 665.048)" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#f38200"/><stop offset="1" stop-color="#e33b00"/></linearGradient><linearGradient id="b" x1="-0.193" y1="799.854" x2="0.116" y2="799.854" gradientTransform="matrix(0, 259.375, 259.375, 0, -207116.812, 249.437)" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#d86722"/><stop offset="1" stop-color="#faa000"/></linearGradient><radialGradient id="c" cx="0.155" cy="801.934" r="0.33" gradientTransform="matrix(344.397, 0, 0, -344.397, 294.508, 276388.851)" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#e25600"/><stop offset="1" stop-color="#e25900" stop-opacity="0"/></radialGradient><linearGradient id="d" x1="1.014" y1="800.555" x2="1.323" y2="800.555" gradientTransform="matrix(0, 667.187, 667.187, 0, -533774.755, -619.44)" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#faa000"/><stop offset="1" stop-color="#d85f15"/></linearGradient><linearGradient id="e" x1="0.612" y1="799.09" x2="0.921" y2="799.09" gradientTransform="matrix(0, 155.49, 155.49, 0, -123905.179, -6.855)" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#edeef0"/><stop offset="1" stop-color="#d1d3d5"/></linearGradient><linearGradient id="f" x1="-0.24" y1="799.463" x2="0.069" y2="799.463" gradientTransform="matrix(0, 193.148, 193.148, 0, -154069.718, 214.703)" xlink:href="#e"/></defs><title>dav1d</title><path class="a" d="M245.519,277.146a.726.726,0,0,1,.8-.741,24.866,24.866,0,0,1,8.143-1.615l178.01.755c2.341.01,6.131.214,8.448.5,0,0,4.411.311,4.411,1.1a17.347,17.347,0,0,1-.221,2.132,4.979,4.979,0,0,1-4.807,4.2H250.547a4.977,4.977,0,0,1-4.807-4.2A17.347,17.347,0,0,1,245.519,277.146Z" transform="translate(-43.722 -44.266)"/><path class="b" d="M417.643,199.5H273.205a6.1,6.1,0,0,0-5.494,4.07L245.7,275.547a2.971,2.971,0,0,0,3.016,4.069h193.42a2.971,2.971,0,0,0,3.016-4.069l-22.012-71.978A6.106,6.106,0,0,0,417.643,199.5Z" transform="translate(-43.722 -44.266)"/><path class="c" d="M417.643,199.5H273.205a6.1,6.1,0,0,0-5.494,4.07L245.7,275.547a2.971,2.971,0,0,0,3.016,4.069h193.42a2.971,2.971,0,0,0,3.016-4.069l-22.012-71.978A6.106,6.106,0,0,0,417.643,199.5Z" transform="translate(-43.722 -44.266)"/><path class="d" d="M345.409,263.207c-65.958,0-65.526-30.6-65.526-30.6a16.462,16.462,0,0,1,.506-6.082L331.908,62.461a6.91,6.91,0,0,1,3.872-4.056s1.623-1.282,9.945-1.282,9.4,1.213,9.4,1.213a7.372,7.372,0,0,1,3.835,4.125l51.47,164.064a15.127,15.127,0,0,1,.506,6.082S411.368,263.207,345.409,263.207Z" transform="translate(-43.722 -44.266)"/><path class="e" d="M323.571,89.008a63.831,63.831,0,0,0,21.068,4.024,57.933,57.933,0,0,0,22.411-4.786l12.293,39.187c-3.4,3.207-12.52,8.842-34.7,8.842-21.625,0-30.077-5.355-33.211-8.595Z" transform="translate(-43.722 -44.266)"/><path class="f" d="M298.33,169.392C300.473,172.8,310.156,184.1,344.4,184.1c35.433,0,45.78-12.7,47.769-15.785l11.675,37.214c-2.27,4.578-14.039,22.447-59.444,22.447-43.386,0-54.973-16.317-57.632-21.761Z" transform="translate(-43.722 -44.266)"/><path class="g" d="M357.132,59.691c-1.31,1.574-6.055,2.741-11.708,2.741-5.636,0-10.369-1.159-11.695-2.726a5.631,5.631,0,0,1,2.051-1.3s1.623-1.282,9.945-1.282,9.4,1.213,9.4,1.213A6.418,6.418,0,0,1,357.132,59.691Z" transform="translate(-43.722 -44.266)"/><path class="h" d="M638.853,57.806h86.119a6.6,6.6,0,0,1,6.6,6.595V276.882a6.6,6.6,0,0,1-6.6,6.6H680.778a6.6,6.6,0,0,1-6.6-6.6V110.18a6.6,6.6,0,0,0-6.6-6.6H626.174a6.6,6.6,0,0,1-6.146-8.987L632.706,62.01A6.6,6.6,0,0,1,638.853,57.806Z" transform="translate(-43.722 -44.266)"/><path class="i" d="M826.988,107.454c16.982,0,31.481,4.969,42.836,14.117a6.584,6.584,0,0,0,10.68-5.17V50.861a6.6,6.6,0,0,1,6.6-6.595h42.583a6.6,6.6,0,0,1,6.6,6.595V276.882a6.6,6.6,0,0,1-6.6,6.6H887.1a6.6,6.6,0,0,1-6.6-6.6h0a6.586,6.586,0,0,0-10.729-5.129c-11.113,9.061-25.076,13.981-41.5,13.981-49,0-81.243-36.107-81.243-89.946C747.035,142.594,778.951,107.454,826.988,107.454Zm14.83,135.08c23.212,0,38.686-18.7,38.686-45.457s-15.474-45.134-38.686-45.134-38.687,18.376-39.009,45.134C803.131,223.836,818.606,242.534,841.818,242.534Z" transform="translate(-43.722 -44.266)"/><path class="i" d="M123.675,107.454c16.982,0,31.481,4.969,42.836,14.117a6.584,6.584,0,0,0,10.68-5.17V50.861a6.6,6.6,0,0,1,6.6-6.595h42.581a6.6,6.6,0,0,1,6.6,6.595V276.882a6.6,6.6,0,0,1-6.6,6.6H183.787a6.6,6.6,0,0,1-6.6-6.6h0a6.586,6.586,0,0,0-10.729-5.129c-11.113,9.061-25.076,13.981-41.5,13.981-49,0-81.243-36.107-81.243-89.946C43.722,142.594,75.639,107.454,123.675,107.454Zm14.83,135.08c23.212,0,38.686-18.7,38.686-45.457s-15.474-45.134-38.686-45.134S99.818,170.319,99.5,197.077C99.818,223.836,115.293,242.534,138.505,242.534Z" transform="translate(-43.722 -44.266)"/><path class="h" d="M532.87,279.286,616.05,66.805a6.6,6.6,0,0,0-6.142-9H573.477A6.6,6.6,0,0,0,567.335,62L508.5,212.286a6.6,6.6,0,0,1-12.283,0L437.386,62a6.6,6.6,0,0,0-6.142-4.191H394.813a6.6,6.6,0,0,0-6.142,9L471.85,279.286a6.6,6.6,0,0,0,6.142,4.192h48.737A6.594,6.594,0,0,0,532.87,279.286Z" transform="translate(-43.722 -44.266)"/></svg>
diff --git a/doc/meson.build b/doc/meson.build
new file mode 100644 (file)
index 0000000..0ef7123
--- /dev/null
+++ b/doc/meson.build
@@ -0,0 +1,43 @@
+# Copyright © 2018, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+doxygen = find_program('doxygen', required: false)
+dot = find_program('dot', required: false)
+
+if doxygen.found() and dot.found()
+    conf_data = configuration_data()
+    conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d'))
+    conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include'))
+    conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir())
+    doxyfile = configure_file(input: 'Doxyfile.in',
+                              output: 'Doxyfile',
+                              configuration: conf_data)
+
+    custom_target('doc',
+                  build_by_default: false,
+                  command: [doxygen, doxyfile],
+                  output: ['html']
+    )
+endif
+
diff --git a/examples/dav1dplay.c b/examples/dav1dplay.c
new file mode 100644 (file)
index 0000000..d6bb262
--- /dev/null
+++ b/examples/dav1dplay.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "vcs_version.h"
+
+#include <getopt.h>
+#include <stdbool.h>
+
+#include <SDL.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/attributes.h"
+#include "tools/input/input.h"
+#include "dp_fifo.h"
+#include "dp_renderer.h"
+
+// Selected renderer callbacks and cookie
+static const Dav1dPlayRenderInfo *renderer_info = NULL;
+
+/**
+ * Render context structure
+ * This structure contains information that needs to be shared
+ * between the decoder and the renderer threads.
+ */
+typedef struct render_context
+{
+    Dav1dPlaySettings settings;
+    Dav1dSettings lib_settings;
+
+    // Renderer private data (passed to callbacks)
+    void *rd_priv;
+
+    // Lock to protect access to the context structure
+    SDL_mutex *lock;
+
+    // Timestamp of previous decoded frame
+    int64_t last_pts;
+    // Timestamp of current decoded frame
+    int64_t current_pts;
+    // Ticks when last frame was received
+    uint32_t last_ticks;
+    // PTS time base
+    double timebase;
+
+    // FIFO of decoded pictures, shared with the renderer
+    Dav1dPlayPtrFifo *fifo;
+
+    // Custom SDL2 event type
+    uint32_t renderer_event_type;
+
+    // Indicates if termination of the decoder thread was requested
+    uint8_t dec_should_terminate;
+} Dav1dPlayRenderContext;
+
+static void dp_settings_print_usage(const char *const app,
+    const char *const reason, ...)
+{
+    if (reason) {
+        va_list args;
+
+        va_start(args, reason);
+        vfprintf(stderr, reason, args);
+        va_end(args);
+        fprintf(stderr, "\n\n");
+    }
+    fprintf(stderr, "Usage: %s [options]\n\n", app);
+    fprintf(stderr, "Supported options:\n"
+            " --input/-i  $file:    input file\n"
+            " --untimed/-u:         ignore PTS, render as fast as possible\n"
+            " --framethreads $num:  number of frame threads (default: 1)\n"
+            " --tilethreads $num:   number of tile threads (default: 1)\n"
+            " --highquality:        enable high quality rendering\n"
+            " --zerocopy/-z:        enable zero copy upload path\n"
+            " --gpugrain/-g:        enable GPU grain synthesis\n"
+            " --version/-v:         print version and exit\n"
+            " --renderer/-r:        select renderer backend (default: auto)\n");
+    exit(1);
+}
+
+static unsigned parse_unsigned(const char *const optarg, const int option,
+                               const char *const app)
+{
+    char *end;
+    const unsigned res = (unsigned) strtoul(optarg, &end, 0);
+    if (*end || end == optarg)
+        dp_settings_print_usage(app, "Invalid argument \"%s\" for option %d; should be an integer",
+          optarg, option);
+    return res;
+}
+
+static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
+    const int argc, char *const *const argv)
+{
+    int o;
+    Dav1dPlaySettings *settings = &rd_ctx->settings;
+    Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
+
+    // Short options
+    static const char short_opts[] = "i:vuzgr:";
+
+    enum {
+        ARG_FRAME_THREADS = 256,
+        ARG_TILE_THREADS,
+        ARG_HIGH_QUALITY,
+    };
+
+    // Long options
+    static const struct option long_opts[] = {
+        { "input",          1, NULL, 'i' },
+        { "version",        0, NULL, 'v' },
+        { "untimed",        0, NULL, 'u' },
+        { "framethreads",   1, NULL, ARG_FRAME_THREADS },
+        { "tilethreads",    1, NULL, ARG_TILE_THREADS },
+        { "highquality",    0, NULL, ARG_HIGH_QUALITY },
+        { "zerocopy",       0, NULL, 'z' },
+        { "gpugrain",       0, NULL, 'g' },
+        { "renderer",       0, NULL, 'r'},
+        { NULL,             0, NULL, 0 },
+    };
+
+    while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) {
+        switch (o) {
+            case 'i':
+                settings->inputfile = optarg;
+                break;
+            case 'v':
+                fprintf(stderr, "%s\n", dav1d_version());
+                exit(0);
+            case 'u':
+                settings->untimed = true;
+                break;
+            case ARG_HIGH_QUALITY:
+                settings->highquality = true;
+                break;
+            case 'z':
+                settings->zerocopy = true;
+                break;
+            case 'g':
+                settings->gpugrain = true;
+                break;
+            case 'r':
+                settings->renderer_name = optarg;
+                break;
+            case ARG_FRAME_THREADS:
+                lib_settings->n_frame_threads =
+                    parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
+                break;
+            case ARG_TILE_THREADS:
+                lib_settings->n_tile_threads =
+                    parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
+                break;
+            default:
+                dp_settings_print_usage(argv[0], NULL);
+        }
+    }
+
+    if (optind < argc)
+        dp_settings_print_usage(argv[0],
+            "Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
+    if (!settings->inputfile)
+        dp_settings_print_usage(argv[0], "Input file (-i/--input) is required");
+    if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0)
+        settings->renderer_name = NULL;
+}
+
+/**
+ * Destroy a Dav1dPlayRenderContext
+ */
+static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx)
+{
+    assert(rd_ctx != NULL);
+
+    renderer_info->destroy_renderer(rd_ctx->rd_priv);
+    dp_fifo_destroy(rd_ctx->fifo);
+    SDL_DestroyMutex(rd_ctx->lock);
+    free(rd_ctx);
+}
+
+/**
+ * Create a Dav1dPlayRenderContext
+ *
+ * \note  The Dav1dPlayRenderContext must be destroyed
+ *        again by using dp_rd_ctx_destroy.
+ */
+static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
+{
+    Dav1dPlayRenderContext *rd_ctx;
+
+    // Alloc
+    rd_ctx = malloc(sizeof(Dav1dPlayRenderContext));
+    if (rd_ctx == NULL) {
+        return NULL;
+    }
+
+    // Register a custom event to notify our SDL main thread
+    // about new frames
+    rd_ctx->renderer_event_type = SDL_RegisterEvents(1);
+    if (rd_ctx->renderer_event_type == UINT32_MAX) {
+        fprintf(stderr, "Failure to create custom SDL event type!\n");
+        free(rd_ctx);
+        return NULL;
+    }
+
+    rd_ctx->fifo = dp_fifo_create(5);
+    if (rd_ctx->fifo == NULL) {
+        fprintf(stderr, "Failed to create FIFO for output pictures!\n");
+        free(rd_ctx);
+        return NULL;
+    }
+
+    rd_ctx->lock = SDL_CreateMutex();
+    if (rd_ctx->lock == NULL) {
+        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+        dp_fifo_destroy(rd_ctx->fifo);
+        free(rd_ctx);
+        return NULL;
+    }
+
+    // Parse and validate arguments
+    dav1d_default_settings(&rd_ctx->lib_settings);
+    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
+    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+    // Select renderer
+    renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name);
+
+    if (renderer_info == NULL) {
+        printf("No suitable rendered matching %s found.\n",
+            (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto");
+    } else {
+        printf("Using %s renderer\n", renderer_info->name);
+    }
+
+    rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL;
+    if (rd_ctx->rd_priv == NULL) {
+        SDL_DestroyMutex(rd_ctx->lock);
+        dp_fifo_destroy(rd_ctx->fifo);
+        free(rd_ctx);
+        return NULL;
+    }
+
+    rd_ctx->last_pts = 0;
+    rd_ctx->last_ticks = 0;
+    rd_ctx->current_pts = 0;
+    rd_ctx->timebase = 0;
+    rd_ctx->dec_should_terminate = 0;
+
+    return rd_ctx;
+}
+
+/**
+ * Notify about new available frame
+ */
+static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
+{
+    SDL_Event event;
+    SDL_zero(event);
+    event.type = rd_ctx->renderer_event_type;
+    event.user.code = code;
+    SDL_PushEvent(&event);
+}
+
+/**
+ * Update the decoder context with a new dav1d picture
+ *
+ * Once the decoder decoded a new picture, this call can be used
+ * to update the internal texture of the render context with the
+ * new picture.
+ */
+static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
+    Dav1dPicture *dav1d_pic)
+{
+    renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
+    rd_ctx->current_pts = dav1d_pic->m.timestamp;
+}
+
+/**
+ * Terminate decoder thread (async)
+ */
+static void dp_rd_ctx_request_shutdown(Dav1dPlayRenderContext *rd_ctx)
+{
+    SDL_LockMutex(rd_ctx->lock);
+    rd_ctx->dec_should_terminate = 1;
+    SDL_UnlockMutex(rd_ctx->lock);
+}
+
+/**
+ * Query state of decoder shutdown request
+ */
+static int dp_rd_ctx_should_terminate(Dav1dPlayRenderContext *rd_ctx)
+{
+    int ret = 0;
+    SDL_LockMutex(rd_ctx->lock);
+    ret = rd_ctx->dec_should_terminate;
+    SDL_UnlockMutex(rd_ctx->lock);
+    return ret;
+}
+
+/**
+ * Render the currently available texture
+ *
+ * Renders the currently available texture, if any.
+ */
+static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
+{
+    // Calculate time since last frame was received
+    uint32_t ticks_now = SDL_GetTicks();
+    uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0;
+
+    // Calculate when to display the frame
+    int64_t pts_diff = rd_ctx->current_pts - rd_ctx->last_pts;
+    int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
+    rd_ctx->last_pts = rd_ctx->current_pts;
+
+    // In untimed mode, simply don't wait
+    if (rd_ctx->settings.untimed)
+        wait_time = 0;
+
+    // This way of timing the playback is not accurate, as there is no
+    // guarantee that SDL_Delay will wait for exactly the requested amount of
+    // time, so an accurate player would need to handle this in a better way.
+    if (wait_time > 0) {
+        SDL_Delay(wait_time);
+    } else if (wait_time < -10) { // Do not warn for minor time drifts
+        fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
+    }
+
+    renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
+
+    rd_ctx->last_ticks = SDL_GetTicks();
+}
+
+/* Decoder thread "main" function */
+static int decoder_thread_main(void *cookie)
+{
+    Dav1dPlayRenderContext *rd_ctx = cookie;
+
+    Dav1dPicture *p;
+    Dav1dContext *c = NULL;
+    Dav1dData data;
+    DemuxerContext *in_ctx = NULL;
+    int res = 0;
+    unsigned n_out = 0, total, timebase[2], fps[2];
+
+    // Store current ticks for stats calculation
+    uint32_t decoder_start = SDL_GetTicks();
+
+    Dav1dPlaySettings settings = rd_ctx->settings;
+
+    if ((res = input_open(&in_ctx, "ivf",
+                          settings.inputfile,
+                          fps, &total, timebase)) < 0)
+    {
+        fprintf(stderr, "Failed to open demuxer\n");
+        res = 1;
+        goto cleanup;
+    }
+
+    double timebase_d = timebase[1]/(double)timebase[0];
+    rd_ctx->timebase = timebase_d;
+
+    if ((res = dav1d_open(&c, &rd_ctx->lib_settings))) {
+        fprintf(stderr, "Failed opening dav1d decoder\n");
+        res = 1;
+        goto cleanup;
+    }
+
+    if ((res = input_read(in_ctx, &data)) < 0) {
+        fprintf(stderr, "Failed demuxing input\n");
+        res = 1;
+        goto cleanup;
+    }
+
+    // Decoder loop
+    do {
+        if (dp_rd_ctx_should_terminate(rd_ctx))
+            break;
+
+        // Send data packets we got from the demuxer to dav1d
+        if ((res = dav1d_send_data(c, &data)) < 0) {
+            // On EAGAIN, dav1d cannot consume more data and
+            // dav1d_get_picture needs to be called first, which
+            // will happen below, so just keep going in that case
+            // and do not error out.
+            if (res != DAV1D_ERR(EAGAIN)) {
+                dav1d_data_unref(&data);
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(-res));
+                break;
+            }
+        }
+
+        p = calloc(1, sizeof(*p));
+
+        // Try to get a decoded frame
+        if ((res = dav1d_get_picture(c, p)) < 0) {
+            // In all error cases, even EAGAIN, p needs to be freed as
+            // it is never added to the queue and would leak.
+            free(p);
+
+            // EAGAIN means dav1d does not have enough data to decode,
+            // so this is not a decoding error; we just need to feed it
+            // more data, which happens in the next run of this decoder
+            // loop.
+            if (res != DAV1D_ERR(EAGAIN)) {
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(-res));
+                break;
+            }
+            res = 0;
+        } else {
+
+            // Queue frame
+            dp_fifo_push(rd_ctx->fifo, p);
+            dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
+
+            n_out++;
+        }
+    } while ((data.sz > 0 || !input_read(in_ctx, &data)));
+
+    // Release remaining data
+    if (data.sz > 0) dav1d_data_unref(&data);
+
+    // Do not drain in case an error occurred and caused us to leave the
+    // decoding loop early.
+    if (res < 0)
+        goto cleanup;
+
+    // Drain decoder
+    // When there is no more data to feed to the decoder, for example
+    // because the file ended, we still need to request pictures: frames
+    // can still be decoded from data we sent earlier. So keep calling
+    // dav1d_get_picture until it returns EAGAIN.
+    do {
+        if (dp_rd_ctx_should_terminate(rd_ctx))
+            break;
+
+        p = calloc(1, sizeof(*p));
+        if (p == NULL) {
+            res = DAV1D_ERR(ENOMEM);
+            break;
+        }
+        res = dav1d_get_picture(c, p);
+        if (res < 0) {
+            free(p);
+            if (res != DAV1D_ERR(EAGAIN)) {
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(-res));
+                break;
+            }
+        } else {
+            // Queue frame
+            dp_fifo_push(rd_ctx->fifo, p);
+            dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
+
+            n_out++;
+        }
+    } while (res != DAV1D_ERR(EAGAIN));
+
+    // Print stats
+    uint32_t decoding_time_ms = SDL_GetTicks() - decoder_start;
+    printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
+        n_out, decoding_time_ms/1000, n_out / (decoding_time_ms / 1000.0));
+
+cleanup:
+    dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_DEC_QUIT);
+
+    if (in_ctx)
+        input_close(in_ctx);
+    if (c)
+        dav1d_close(&c);
+
+    return (res != DAV1D_ERR(EAGAIN) && res < 0);
+}
+
+int main(int argc, char **argv)
+{
+    SDL_Thread *decoder_thread;
+
+    // Check for version mismatch between library and tool
+    const char *version = dav1d_version();
+    if (strcmp(version, DAV1D_VERSION)) {
+        fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
+                version, DAV1D_VERSION);
+        return 1;
+    }
+
+    // Init SDL2 library
+    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
+        return 10;
+
+    // Create render context
+    Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv);
+    if (rd_ctx == NULL) {
+        fprintf(stderr, "Failed creating render context\n");
+        return 5;
+    }
+
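+    // With --zerocopy, let the renderer allocate the pictures (e.g. in
+    // GPU-mapped memory), so decoded frames can be uploaded without an
+    // extra copy.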
+    if (rd_ctx->settings.zerocopy) {
+        if (renderer_info->alloc_pic) {
+            rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
+                .cookie = rd_ctx->rd_priv,
+                .alloc_picture_callback = renderer_info->alloc_pic,
+                .release_picture_callback = renderer_info->release_pic,
+            };
+        } else {
+            fprintf(stderr, "--zerocopy unsupported by selected renderer\n");
+        }
+    }
+
+    if (rd_ctx->settings.gpugrain) {
+        if (renderer_info->supports_gpu_grain) {
+            rd_ctx->lib_settings.apply_grain = 0;
+        } else {
+            fprintf(stderr, "--gpugrain unsupported by selected renderer\n");
+        }
+    }
+
+    // Start decoder thread
+    decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
+
+    // Main loop
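+    // The main thread owns all SDL rendering; it waits for SDL events as
+    // well as for the user events posted by the decoder thread.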
+    while (1) {
+
+        SDL_Event e;
+        if (SDL_WaitEvent(&e)) {
+            if (e.type == SDL_QUIT) {
+                dp_rd_ctx_request_shutdown(rd_ctx);
+            } else if (e.type == SDL_WINDOWEVENT) {
+                if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
+                    // TODO: Handle window resizes
+                }
+            } else if (e.type == rd_ctx->renderer_event_type) {
+                if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
+                    // Dequeue frame and update the render context with it
+                    Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
+
+                    // Do not update textures during termination
+                    if (!dp_rd_ctx_should_terminate(rd_ctx))
+                        dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
+                    dav1d_picture_unref(p);
+                    free(p);
+                } else if (e.user.code == DAV1D_EVENT_DEC_QUIT) {
+                    break;
+                }
+            }
+        }
+
+        // Do not render during termination
+        if (!dp_rd_ctx_should_terminate(rd_ctx))
+            dp_rd_ctx_render(rd_ctx);
+    }
+
+    int decoder_ret = 0;
+    SDL_WaitThread(decoder_thread, &decoder_ret);
+
+    dp_rd_ctx_destroy(rd_ctx);
+
+    return decoder_ret;
+}
diff --git a/examples/dp_fifo.c b/examples/dp_fifo.c
new file mode 100644 (file)
index 0000000..243d2e9
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <SDL.h>
+#include <assert.h>
+
+#include "dp_fifo.h"
+
+// FIFO structure
+struct dp_fifo
+{
+    SDL_mutex *lock;
+    SDL_cond *cond_change;
+    size_t capacity;
+    size_t count;
+    void **entries;
+};
+
+
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
+{
+    Dav1dPlayPtrFifo *fifo;
+
+    assert(capacity > 0);
+    if (capacity == 0)
+        return NULL;
+
+    fifo = malloc(sizeof(*fifo));
+    if (fifo == NULL)
+        return NULL;
+
+    fifo->capacity = capacity;
+    fifo->count = 0;
+
+    fifo->lock = SDL_CreateMutex();
+    if (fifo->lock == NULL) {
+        free(fifo);
+        return NULL;
+    }
+    fifo->cond_change = SDL_CreateCond();
+    if (fifo->cond_change == NULL) {
+        SDL_DestroyMutex(fifo->lock);
+        free(fifo);
+        return NULL;
+    }
+
+    fifo->entries = calloc(capacity, sizeof(void*));
+    if (fifo->entries == NULL) {
+        dp_fifo_destroy(fifo);
+        return NULL;
+    }
+
+    return fifo;
+}
+
+// Destroy FIFO
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
+{
+    assert(fifo->count == 0);
+    SDL_DestroyMutex(fifo->lock);
+    SDL_DestroyCond(fifo->cond_change);
+    free(fifo->entries);
+    free(fifo);
+}
+
+// Push to FIFO
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
+{
+    SDL_LockMutex(fifo->lock);
+    while (fifo->count == fifo->capacity)
+        SDL_CondWait(fifo->cond_change, fifo->lock);
+    fifo->entries[fifo->count++] = element;
+    if (fifo->count == 1)
+        SDL_CondSignal(fifo->cond_change);
+    SDL_UnlockMutex(fifo->lock);
+}
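+
+// Note: cond_change is only signaled on the empty->non-empty transition
+// above (and on the full->non-full transition in dp_fifo_shift below),
+// as those are the only boundaries a blocked thread can wait on.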
+
+// Helper that shifts the FIFO array
+static void *dp_fifo_array_shift(void **arr, size_t len)
+{
+    void *shifted_element = arr[0];
+    for (size_t i = 1; i < len; ++i)
+        arr[i-1] = arr[i];
+    return shifted_element;
+}
+
+// Get item from FIFO
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
+{
+    SDL_LockMutex(fifo->lock);
+    while (fifo->count == 0)
+        SDL_CondWait(fifo->cond_change, fifo->lock);
+    void *res = dp_fifo_array_shift(fifo->entries, fifo->count--);
+    if (fifo->count == fifo->capacity - 1)
+        SDL_CondSignal(fifo->cond_change);
+    SDL_UnlockMutex(fifo->lock);
+    return res;
+}
+
+
diff --git a/examples/dp_fifo.h b/examples/dp_fifo.h
new file mode 100644 (file)
index 0000000..a94b089
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Dav1dPlay FIFO helper
+ */
+
+typedef struct dp_fifo Dav1dPlayPtrFifo;
+
+/* Create a FIFO
+ *
+ * Creates a FIFO with the given capacity.
+ * If the capacity is reached, new inserts into the FIFO
+ * will block until enough space is available again.
+ */
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity);
+
+/* Destroy a FIFO
+ *
+ * The FIFO must be empty before it is destroyed!
+ */
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo);
+
+/* Shift FIFO
+ *
+ * Return the first item from the FIFO, thereby removing it from
+ * the FIFO and making room for new entries.
+ */
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
+
+/* Push to FIFO
+ *
+ * Add an item to the end of the FIFO.
+ * If the FIFO is full, this call will block until enough space is
+ * available again, so calling it when no other thread will call
+ * dp_fifo_shift (e.g. from the consumer thread itself) leads to a deadlock.
+ */
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);
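+
+/* Illustrative usage sketch (assuming one producer and one consumer thread):
+ *
+ *   Dav1dPlayPtrFifo *f = dp_fifo_create(8); // bounded queue with 8 slots
+ *   dp_fifo_push(f, item);                   // producer side, blocks while full
+ *   void *item = dp_fifo_shift(f);           // consumer side, blocks while empty
+ *   dp_fifo_destroy(f);                      // only valid once the FIFO is empty
+ */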
diff --git a/examples/dp_renderer.h b/examples/dp_renderer.h
new file mode 100644 (file)
index 0000000..4c6f295
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "dav1d/dav1d.h"
+
+#include <SDL.h>
+#ifdef HAVE_PLACEBO
+# include <libplacebo/config.h>
+#endif
+
+// Check libplacebo Vulkan rendering
+#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
+# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
+#  define HAVE_RENDERER_PLACEBO
+#  define HAVE_PLACEBO_VULKAN
+# endif
+#endif
+
+// Check libplacebo OpenGL rendering
+#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
+# define HAVE_RENDERER_PLACEBO
+# define HAVE_PLACEBO_OPENGL
+#endif
+
+/**
+ * Settings structure
+ *
+ * Holds all settings available for the player;
+ * it is usually filled by parsing command-line arguments.
+ */
+typedef struct {
+    const char *inputfile;
+    const char *renderer_name;
+    int highquality;
+    int untimed;
+    int zerocopy;
+    int gpugrain;
+} Dav1dPlaySettings;
+
+#define WINDOW_WIDTH  910
+#define WINDOW_HEIGHT 512
+
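+// Codes for the SDL user events posted from the decoder thread to the main loop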
+#define DAV1D_EVENT_NEW_FRAME 1
+#define DAV1D_EVENT_DEC_QUIT  2
+
+/**
+ * Renderer info
+ */
+typedef struct rdr_info
+{
+    // Renderer name
+    const char *name;
+    // Cookie passed to the renderer implementation callbacks
+    void *cookie;
+    // Callback to create the renderer
+    void* (*create_renderer)(void);
+    // Callback to destroy the renderer
+    void (*destroy_renderer)(void *cookie);
+    // Callback to the render function that renders a previously sent frame
+    void (*render)(void *cookie, const Dav1dPlaySettings *settings);
+    // Callback to the send frame function
+    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
+                        const Dav1dPlaySettings *settings);
+    // Callback for alloc/release pictures (optional)
+    int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
+    void (*release_pic)(Dav1dPicture *pic, void *cookie);
+    // Whether or not this renderer can apply on-GPU film grain synthesis
+    int supports_gpu_grain;
+} Dav1dPlayRenderInfo;
+
+extern const Dav1dPlayRenderInfo rdr_placebo_vk;
+extern const Dav1dPlayRenderInfo rdr_placebo_gl;
+extern const Dav1dPlayRenderInfo rdr_sdl;
+
+// Available renderers, ordered by priority
+static const Dav1dPlayRenderInfo* const dp_renderers[] = {
+    &rdr_placebo_vk,
+    &rdr_placebo_gl,
+    &rdr_sdl,
+};
+
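+// Find a renderer by name; passing NULL selects the first (highest
+// priority) renderer that was compiled in.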
+static inline const Dav1dPlayRenderInfo *dp_get_renderer(const char *name)
+{
+    for (size_t i = 0; i < (sizeof(dp_renderers)/sizeof(*dp_renderers)); ++i)
+    {
+        if (dp_renderers[i]->name == NULL)
+            continue;
+
+        if (name == NULL || strcmp(name, dp_renderers[i]->name) == 0) {
+            return dp_renderers[i];
+        }
+    }
+    return NULL;
+}
+
+static inline SDL_Window *dp_create_sdl_window(int window_flags)
+{
+    SDL_Window *win;
+    window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI;
+
+    win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
+        WINDOW_WIDTH, WINDOW_HEIGHT, window_flags);
+    if (win != NULL)
+        SDL_SetWindowResizable(win, SDL_TRUE);
+
+    return win;
+}
diff --git a/examples/dp_renderer_placebo.c b/examples/dp_renderer_placebo.c
new file mode 100644 (file)
index 0000000..beb1d42
--- /dev/null
@@ -0,0 +1,723 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#ifdef HAVE_RENDERER_PLACEBO
+#include <assert.h>
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/utils/upload.h>
+
+#ifdef HAVE_PLACEBO_VULKAN
+# include <libplacebo/vulkan.h>
+# include <SDL_vulkan.h>
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+# include <libplacebo/opengl.h>
+# include <SDL_opengl.h>
+#endif
+
+
+/**
+ * Renderer context for libplacebo
+ */
+typedef struct renderer_priv_ctx
+{
+    // SDL window
+    SDL_Window *win;
+    // Placebo context
+    struct pl_context *ctx;
+    // Placebo renderer
+    struct pl_renderer *renderer;
+#ifdef HAVE_PLACEBO_VULKAN
+    // Placebo Vulkan handle
+    const struct pl_vulkan *vk;
+    // Placebo Vulkan instance
+    const struct pl_vk_inst *vk_inst;
+    // Vulkan surface
+    VkSurfaceKHR surf;
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+    // Placebo OpenGL handle
+    const struct pl_opengl *gl;
+#endif
+    // Placebo GPU
+    const struct pl_gpu *gpu;
+    // Placebo swapchain
+    const struct pl_swapchain *swapchain;
+    // Lock protecting access to the texture
+    SDL_mutex *lock;
+    // Image to render, and planes backing them
+    struct pl_image image;
+    const struct pl_tex *plane_tex[3];
+} Dav1dPlayRendererPrivateContext;
+
+static Dav1dPlayRendererPrivateContext*
+    placebo_renderer_create_common(int window_flags)
+{
+    // Create Window
+    SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE);
+    if (sdlwin == NULL)
+        return NULL;
+
+    // Alloc
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
+    if (rd_priv_ctx == NULL) {
+        return NULL;
+    }
+
+    *rd_priv_ctx = (Dav1dPlayRendererPrivateContext) {0};
+    rd_priv_ctx->win = sdlwin;
+
+    // Init libplacebo
+    rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
+        .log_cb     = pl_log_color,
+#ifndef NDEBUG
+        .log_level  = PL_LOG_DEBUG,
+#else
+        .log_level  = PL_LOG_WARN,
+#endif
+    });
+    if (rd_priv_ctx->ctx == NULL) {
+        free(rd_priv_ctx);
+        return NULL;
+    }
+
+    // Create Mutex
+    rd_priv_ctx->lock = SDL_CreateMutex();
+    if (rd_priv_ctx->lock == NULL) {
+        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+        pl_context_destroy(&(rd_priv_ctx->ctx));
+        free(rd_priv_ctx);
+        return NULL;
+    }
+
+    return rd_priv_ctx;
+}
+
+#ifdef HAVE_PLACEBO_OPENGL
+static void *placebo_renderer_create_gl(void)
+{
+    SDL_Window *sdlwin = NULL;
+    SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+
+    // Common init
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+        placebo_renderer_create_common(SDL_WINDOW_OPENGL);
+
+    if (rd_priv_ctx == NULL)
+        return NULL;
+    sdlwin = rd_priv_ctx->win;
+
+    // Init OpenGL
+    struct pl_opengl_params params = pl_opengl_default_params;
+# ifndef NDEBUG
+    params.debug = true;
+# endif
+
+    SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin);
+    SDL_GL_MakeCurrent(sdlwin, glcontext);
+
+    rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->ctx, &params);
+    if (!rd_priv_ctx->gl) {
+        fprintf(stderr, "Failed creating opengl device!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl,
+        &(struct pl_opengl_swapchain_params) {
+            .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow,
+            .priv = sdlwin,
+        });
+
+    if (!rd_priv_ctx->swapchain) {
+        fprintf(stderr, "Failed creating opengl swapchain!\n");
+        exit(2);
+    }
+
+    int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+    SDL_GL_GetDrawableSize(sdlwin, &w, &h);
+
+    if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+        fprintf(stderr, "Failed resizing vulkan swapchain!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->gpu = rd_priv_ctx->gl->gpu;
+
+    if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+        printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+    return rd_priv_ctx;
+}
+#endif
+
+#ifdef HAVE_PLACEBO_VULKAN
+static void *placebo_renderer_create_vk(void)
+{
+    SDL_Window *sdlwin = NULL;
+
+    // Common init
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+        placebo_renderer_create_common(SDL_WINDOW_VULKAN);
+
+    if (rd_priv_ctx == NULL)
+        return NULL;
+    sdlwin = rd_priv_ctx->win;
+
+    // Init Vulkan
+    unsigned num = 0;
+    if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) {
+        fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError());
+        exit(1);
+    }
+
+    const char **extensions = malloc(num * sizeof(const char *));
+    if (!extensions) {
+        fprintf(stderr, "Failed allocating Vulkan extension list\n");
+        exit(1);
+    }
+
+    SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions);
+    if (!ok) {
+        fprintf(stderr, "Failed getting Vk instance extensions\n");
+        exit(1);
+    }
+
+    if (num > 0) {
+        printf("Requesting %d additional Vulkan extensions:\n", num);
+        for (unsigned i = 0; i < num; i++)
+            printf("    %s\n", extensions[i]);
+    }
+
+    struct pl_vk_inst_params iparams = pl_vk_inst_default_params;
+    iparams.extensions = extensions;
+    iparams.num_extensions = num;
+
+    rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams);
+    if (!rd_priv_ctx->vk_inst) {
+        fprintf(stderr, "Failed creating Vulkan instance!\n");
+        exit(1);
+    }
+    free(extensions);
+
+    if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) {
+        fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError());
+        exit(1);
+    }
+
+    struct pl_vulkan_params params = pl_vulkan_default_params;
+    params.instance = rd_priv_ctx->vk_inst->instance;
+    params.surface = rd_priv_ctx->surf;
+    params.allow_software = true;
+
+    rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params);
+    if (!rd_priv_ctx->vk) {
+        fprintf(stderr, "Failed creating vulkan device!\n");
+        exit(2);
+    }
+
+    // Create swapchain
+    rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk,
+        &(struct pl_vulkan_swapchain_params) {
+            .surface = rd_priv_ctx->surf,
+            .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR,
+        });
+
+    if (!rd_priv_ctx->swapchain) {
+        fprintf(stderr, "Failed creating vulkan swapchain!\n");
+        exit(2);
+    }
+
+    int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+    if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+        fprintf(stderr, "Failed resizing vulkan swapchain!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu;
+
+    if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+        printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+    return rd_priv_ctx;
+}
+#endif
+
+static void placebo_renderer_destroy(void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    pl_renderer_destroy(&(rd_priv_ctx->renderer));
+    pl_swapchain_destroy(&(rd_priv_ctx->swapchain));
+    for (int i = 0; i < 3; i++)
+        pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));
+
+#ifdef HAVE_PLACEBO_VULKAN
+    if (rd_priv_ctx->vk) {
+        pl_vulkan_destroy(&(rd_priv_ctx->vk));
+        vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
+        pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
+    }
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+    if (rd_priv_ctx->gl)
+        pl_opengl_destroy(&(rd_priv_ctx->gl));
+#endif
+
+    SDL_DestroyWindow(rd_priv_ctx->win);
+
+    pl_context_destroy(&(rd_priv_ctx->ctx));
+}
+
+static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+    if (!rd_priv_ctx->image.num_planes) {
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return;
+    }
+
+    // Prepare rendering
+    if (rd_priv_ctx->renderer == NULL) {
+        rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->gpu);
+    }
+
+    struct pl_swapchain_frame frame;
+    bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame);
+    if (!ok) {
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return;
+    }
+
+    struct pl_render_params render_params = {0};
+    if (settings->highquality)
+        render_params = pl_render_default_params;
+
+    struct pl_render_target target;
+    pl_render_target_from_swapchain(&target, &frame);
+    target.profile = (struct pl_icc_profile) {
+        .data = NULL,
+        .len = 0,
+    };
+
+#if PL_API_VER >= 66
+    pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0);
+    if (pl_render_target_partial(&target))
+        pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 });
+#endif
+
+    if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) {
+        fprintf(stderr, "Failed rendering frame!\n");
+        pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 });
+    }
+
+    ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
+    if (!ok) {
+        fprintf(stderr, "Failed submitting frame!\n");
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return;
+    }
+
+    pl_swapchain_swap_buffers(rd_priv_ctx->swapchain);
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
+                                const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    if (dav1d_pic == NULL) {
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return 0;
+    }
+
+    int width = dav1d_pic->p.w;
+    int height = dav1d_pic->p.h;
+    int sub_x = 0, sub_y = 0;
+    int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up
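+    // e.g. bpc == 8 -> 1 byte per component, bpc == 10 or 12 -> 2 bytes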
+    enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
+
+    struct pl_image *image = &rd_priv_ctx->image;
+    *image = (struct pl_image) {
+        .num_planes = 3,
+        .width      = width,
+        .height     = height,
+        .src_rect   = {0, 0, width, height},
+
+        .repr = {
+            .bits = {
+                .sample_depth = bytes * 8,
+                .color_depth = dav1d_pic->p.bpc,
+            },
+        },
+    };
+
+    // Figure out the correct plane dimensions/count
+    switch (dav1d_pic->p.layout) {
+    case DAV1D_PIXEL_LAYOUT_I400:
+        image->num_planes = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I420:
+        sub_x = sub_y = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I422:
+        sub_x = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I444:
+        break;
+    }
+
+    // Set the right colorspace metadata etc.
+    switch (dav1d_pic->seq_hdr->pri) {
+    case DAV1D_COLOR_PRI_UNKNOWN:   image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
+    case DAV1D_COLOR_PRI_BT709:     image->color.primaries = PL_COLOR_PRIM_BT_709; break;
+    case DAV1D_COLOR_PRI_BT470M:    image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
+    case DAV1D_COLOR_PRI_BT470BG:   image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+    case DAV1D_COLOR_PRI_BT601:     image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+    case DAV1D_COLOR_PRI_BT2020:    image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
+
+    case DAV1D_COLOR_PRI_XYZ:
+        // Handled below
+        assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
+        break;
+
+    default:
+        printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->pri);
+        break;
+    }
+
+    switch (dav1d_pic->seq_hdr->trc) {
+    case DAV1D_TRC_BT709:
+    case DAV1D_TRC_BT470M:
+    case DAV1D_TRC_BT470BG:
+    case DAV1D_TRC_BT601:
+    case DAV1D_TRC_SMPTE240:
+    case DAV1D_TRC_BT2020_10BIT:
+    case DAV1D_TRC_BT2020_12BIT:
+        // These all map to the effective "SDR" CRT-based EOTF, BT.1886
+        image->color.transfer = PL_COLOR_TRC_BT_1886;
+        break;
+
+    case DAV1D_TRC_UNKNOWN:     image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
+    case DAV1D_TRC_LINEAR:      image->color.transfer = PL_COLOR_TRC_LINEAR; break;
+    case DAV1D_TRC_SRGB:        image->color.transfer = PL_COLOR_TRC_SRGB; break;
+    case DAV1D_TRC_SMPTE2084:   image->color.transfer = PL_COLOR_TRC_PQ; break;
+    case DAV1D_TRC_HLG:         image->color.transfer = PL_COLOR_TRC_HLG; break;
+
+    default:
+        printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->trc);
+        break;
+    }
+
+    switch (dav1d_pic->seq_hdr->mtrx) {
+    case DAV1D_MC_IDENTITY:
+        // This is going to be either RGB or XYZ
+        if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
+            image->repr.sys = PL_COLOR_SYSTEM_XYZ;
+        } else {
+            image->repr.sys = PL_COLOR_SYSTEM_RGB;
+        }
+        break;
+
+    case DAV1D_MC_UNKNOWN:
+        // PL_COLOR_SYSTEM_UNKNOWN would map to RGB, so guess a YCbCr
+        // system from the image dimensions instead
+        image->repr.sys = pl_color_system_guess_ycbcr(width, height);
+        break;
+
+    case DAV1D_MC_BT709:        image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
+    case DAV1D_MC_BT601:        image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
+    case DAV1D_MC_SMPTE240:     image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
+    case DAV1D_MC_SMPTE_YCGCO:  image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
+    case DAV1D_MC_BT2020_NCL:   image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
+    case DAV1D_MC_BT2020_CL:    image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
+
+    case DAV1D_MC_ICTCP:
+        // This one is split up based on the actual HDR curve in use
+        if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
+            image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
+        } else {
+            image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
+        }
+        break;
+
+    default:
+        printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
+        break;
+    }
+
+    if (dav1d_pic->seq_hdr->color_range) {
+        image->repr.levels = PL_COLOR_LEVELS_PC;
+    } else {
+        image->repr.levels = PL_COLOR_LEVELS_TV;
+    }
+
+    switch (dav1d_pic->seq_hdr->chr) {
+    case DAV1D_CHR_UNKNOWN:     chroma_loc = PL_CHROMA_UNKNOWN; break;
+    case DAV1D_CHR_VERTICAL:    chroma_loc = PL_CHROMA_LEFT; break;
+    case DAV1D_CHR_COLOCATED:   chroma_loc = PL_CHROMA_TOP_LEFT; break;
+    }
+
+#if PL_API_VER >= 63
+    if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
+        Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
+        struct pl_av1_grain_data *dst = &image->av1_grain;
+        *dst = (struct pl_av1_grain_data) {
+            .grain_seed     = src->seed,
+            .num_points_y   = src->num_y_points,
+            .chroma_scaling_from_luma = src->chroma_scaling_from_luma,
+            .num_points_uv  = { src->num_uv_points[0], src->num_uv_points[1] },
+            .scaling_shift  = src->scaling_shift,
+            .ar_coeff_lag   = src->ar_coeff_lag,
+            .ar_coeff_shift = src->ar_coeff_shift,
+            .grain_scale_shift = src->grain_scale_shift,
+            .uv_mult        = { src->uv_mult[0], src->uv_mult[1] },
+            .uv_mult_luma   = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
+            .uv_offset      = { src->uv_offset[0], src->uv_offset[1] },
+            .overlap        = src->overlap_flag,
+        };
+
+        assert(sizeof(dst->points_y) == sizeof(src->y_points));
+        assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
+        assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
+        memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
+        memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
+        memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
+
+        // this array has different row sizes for alignment, so copy it
+        // element by element
+        for (int c = 0; c < 2; c++) {
+            for (int i = 0; i < 25; i++)
+                dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
+        }
+    }
+#endif
+
+    // Upload the actual planes
+    struct pl_plane_data data[3] = {
+        {
+            // Y plane
+            .type           = PL_FMT_UNORM,
+            .width          = width,
+            .height         = height,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[0],
+            .component_size = {bytes * 8},
+            .component_map  = {0},
+        }, {
+            // U plane
+            .type           = PL_FMT_UNORM,
+            .width          = width >> sub_x,
+            .height         = height >> sub_y,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[1],
+            .component_size = {bytes * 8},
+            .component_map  = {1},
+        }, {
+            // V plane
+            .type           = PL_FMT_UNORM,
+            .width          = width >> sub_x,
+            .height         = height >> sub_y,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[1],
+            .component_size = {bytes * 8},
+            .component_map  = {2},
+        },
+    };
+
+    bool ok = true;
+
+    for (int i = 0; i < image->num_planes; i++) {
+        if (settings->zerocopy) {
+            const struct pl_buf *buf = dav1d_pic->allocator_data;
+            assert(buf);
+            data[i].buf = buf;
+            data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data;
+        } else {
+            data[i].pixels = dav1d_pic->data[i];
+        }
+
+        ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
+    }
+
+    // Apply the correct chroma plane shift. This has to be done after pl_upload_plane
+#if PL_API_VER >= 67
+    pl_image_set_chroma_location(image, chroma_loc);
+#else
+    pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
+    pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
+#endif
+
+    if (!ok) {
+        fprintf(stderr, "Failed uploading planes!\n");
+        *image = (struct pl_image) {0};
+    }
+
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return !ok;
+}
+
+// Align x up to a multiple of align (which must be a power of two)
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
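+// e.g. ALIGN2(5, 4) == 8 and ALIGN2(8, 4) == 8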
+
+static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    const struct pl_gpu *gpu = rd_priv_ctx->gpu;
+    int ret = DAV1D_ERR(ENOMEM);
+
+    // Copied from dav1d_default_picture_alloc
+    const int hbd = p->p.bpc > 8;
+    const int aligned_w = ALIGN2(p->p.w, 128);
+    const int aligned_h = ALIGN2(p->p.h, 128);
+    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    p->stride[0] = aligned_w << hbd;
+    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+    // Align strides up to multiples of the GPU performance hints
+    p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
+    p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
+
+    // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
+    size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+    const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
+    const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+    // The extra DAV1D_PICTURE_ALIGNMENTs brute-force plane alignment, even
+    // if the driver reports unusually large alignment requirements
+    const size_t pic_size = y_sz + 2 * uv_sz;
+    const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
+
+    // Validate size limitations
+    if (total_size > gpu->limits.max_xfer_size) {
+        printf("alloc of %zu bytes exceeds limits\n", total_size);
+        goto err;
+    }
+
+    const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .type = PL_BUF_TEX_TRANSFER,
+        .host_mapped = true,
+        .size = total_size,
+        .memory_type = PL_BUF_MEM_HOST,
+        .user_data = p,
+    });
+
+    if (!buf) {
+        printf("alloc of GPU mapped buffer failed\n");
+        goto err;
+    }
+
+    assert(buf->data);
+    uintptr_t base = (uintptr_t) buf->data, data[3];
+    data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+    data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+    data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+    // Sanity check offset alignment for the sake of debugging
+    if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
+        data[1] - base != ALIGN2(data[1] - base, off_align) ||
+        data[2] - base != ALIGN2(data[2] - base, off_align))
+    {
+        printf("GPU buffer horribly misaligned, expect slowdown!\n");
+    }
+
+    p->allocator_data = (void *) buf;
+    p->data[0] = (void *) data[0];
+    p->data[1] = (void *) data[1];
+    p->data[2] = (void *) data[2];
+    ret = 0;
+
+    // fall through
+err:
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return ret;
+}
+
+static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    assert(pic->allocator_data);
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+    const struct pl_gpu *gpu = rd_priv_ctx->gpu;
+    pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+#ifdef HAVE_PLACEBO_VULKAN
+const Dav1dPlayRenderInfo rdr_placebo_vk = {
+    .name = "placebo-vk",
+    .create_renderer = placebo_renderer_create_vk,
+    .destroy_renderer = placebo_renderer_destroy,
+    .render = placebo_render,
+    .update_frame = placebo_upload_image,
+    .alloc_pic = placebo_alloc_pic,
+    .release_pic = placebo_release_pic,
+
+# if PL_API_VER >= 63
+    .supports_gpu_grain = 1,
+# endif
+};
+#else
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+#endif
+
+#ifdef HAVE_PLACEBO_OPENGL
+const Dav1dPlayRenderInfo rdr_placebo_gl = {
+    .name = "placebo-gl",
+    .create_renderer = placebo_renderer_create_gl,
+    .destroy_renderer = placebo_renderer_destroy,
+    .render = placebo_render,
+    .update_frame = placebo_upload_image,
+    .alloc_pic = placebo_alloc_pic,
+    .release_pic = placebo_release_pic,
+
+# if PL_API_VER >= 63
+    .supports_gpu_grain = 1,
+# endif
+};
+#else
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
+
+#else
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
diff --git a/examples/dp_renderer_sdl.c b/examples/dp_renderer_sdl.c
new file mode 100644 (file)
index 0000000..078d613
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#include <assert.h>
+
+/**
+ * Renderer context for SDL
+ */
+typedef struct renderer_priv_ctx
+{
+    // SDL window
+    SDL_Window *win;
+    // SDL renderer
+    SDL_Renderer *renderer;
+    // Lock protecting access to the texture
+    SDL_mutex *lock;
+    // Texture to render
+    SDL_Texture *tex;
+} Dav1dPlayRendererPrivateContext;
+
+static void *sdl_renderer_create(void)
+{
+    SDL_Window *win = dp_create_sdl_window(0);
+    if (win == NULL)
+        return NULL;
+
+    // Alloc
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
+    if (rd_priv_ctx == NULL) {
+        return NULL;
+    }
+    rd_priv_ctx->win = win;
+
+    // Create renderer
+    rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
+    if (rd_priv_ctx->renderer == NULL) {
+        fprintf(stderr, "SDL_CreateRenderer failed: %s\n", SDL_GetError());
+        free(rd_priv_ctx);
+        return NULL;
+    }
+    // Set scale quality
+    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear");
+
+    // Create Mutex
+    rd_priv_ctx->lock = SDL_CreateMutex();
+    if (rd_priv_ctx->lock == NULL) {
+        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+        free(rd_priv_ctx);
+        return NULL;
+    }
+
+    rd_priv_ctx->tex = NULL;
+
+    return rd_priv_ctx;
+}
+
+static void sdl_renderer_destroy(void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    SDL_DestroyRenderer(rd_priv_ctx->renderer);
+    SDL_DestroyMutex(rd_priv_ctx->lock);
+    free(rd_priv_ctx);
+}
+
+static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    if (rd_priv_ctx->tex == NULL) {
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return;
+    }
+
+    // Display the frame
+    SDL_RenderClear(rd_priv_ctx->renderer);
+    SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL);
+    SDL_RenderPresent(rd_priv_ctx->renderer);
+
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
+                              const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    if (dav1d_pic == NULL) {
+        rd_priv_ctx->tex = NULL;
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return 0;
+    }
+
+    int width = dav1d_pic->p.w;
+    int height = dav1d_pic->p.h;
+    int tex_w = width;
+    int tex_h = height;
+
+    enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
+
+    if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
+        fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
+        exit(50);
+    }
+
+    SDL_Texture *texture = rd_priv_ctx->tex;
+    if (texture != NULL) {
+        SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h);
+        if (tex_w != width || tex_h != height) {
+            SDL_DestroyTexture(texture);
+            texture = NULL;
+        }
+    }
+
+    if (texture == NULL) {
+        texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
+            SDL_TEXTUREACCESS_STREAMING, width, height);
+        if (texture == NULL) {
+            SDL_UnlockMutex(rd_priv_ctx->lock);
+            return -1;
+        }
+    }
+
+    SDL_UpdateYUVTexture(texture, NULL,
+        dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y
+        dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U
+        dav1d_pic->data[2], (int)dav1d_pic->stride[1]  // V
+        );
+
+    rd_priv_ctx->tex = texture;
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return 0;
+}
+
+const Dav1dPlayRenderInfo rdr_sdl = {
+    .name = "sdl",
+    .create_renderer = sdl_renderer_create,
+    .destroy_renderer = sdl_renderer_destroy,
+    .render = sdl_render,
+    .update_frame = sdl_update_texture
+};
diff --git a/examples/meson.build b/examples/meson.build
new file mode 100644 (file)
index 0000000..50e097a
--- /dev/null
@@ -0,0 +1,74 @@
+# Copyright © 2018, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d examples
+#
+
+# Leave subdir if examples are disabled
+if not get_option('enable_examples')
+    subdir_done()
+endif
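+
+# Examples are disabled by default; configure with e.g.
+#   meson build -Denable_examples=true
+# (the option is defined in meson_options.txt) to build dav1dplay.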
+
+
+# dav1d player sources
+dav1dplay_sources = files(
+    'dav1dplay.c',
+    'dp_fifo.c',
+    'dp_renderer_placebo.c',
+    'dp_renderer_sdl.c',
+)
+
+sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)
+
+if sdl2_dependency.found()
+    dav1dplay_deps = [sdl2_dependency]
+    dav1dplay_cflags = []
+
+    placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
+
+    if placebo_dependency.found()
+        dav1dplay_deps += placebo_dependency
+        dav1dplay_cflags += '-DHAVE_PLACEBO'
+
+        # If libplacebo is found, we might be able to use Vulkan
+        # with it, in which case we need the Vulkan library too.
+        vulkan_dependency = dependency('vulkan', required: false)
+        if vulkan_dependency.found()
+            dav1dplay_deps += vulkan_dependency
+            dav1dplay_cflags += '-DHAVE_VULKAN'
+        endif
+    endif
+
+    dav1dplay = executable('dav1dplay',
+        dav1dplay_sources,
+        rev_target,
+
+        link_with : [libdav1d, dav1d_input_objs],
+        include_directories : [dav1d_inc_dirs],
+        dependencies : [getopt_dependency, dav1dplay_deps],
+        install : true,
+        c_args : dav1dplay_cflags,
+    )
+endif
diff --git a/gcovr.cfg b/gcovr.cfg
new file mode 100644 (file)
index 0000000..f768de8
--- /dev/null
+++ b/gcovr.cfg
@@ -0,0 +1,3 @@
+exclude = .*/tests/.*
+exclude = .*/tools/.*
+exclude = .*/include/common/dump.h
diff --git a/include/common/attributes.h b/include/common/attributes.h
new file mode 100644 (file)
index 0000000..0683b50
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_ATTRIBUTES_H
+#define DAV1D_COMMON_ATTRIBUTES_H
+
+#include "config.h"
+
+#include <stddef.h>
+
+#ifdef __GNUC__
+#define ATTR_ALIAS __attribute__((may_alias))
+#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr)))
+#define COLD __attribute__((cold))
+#else
+#define ATTR_ALIAS
+#define ATTR_FORMAT_PRINTF(fmt, attr)
+#define COLD
+#endif
+
+#if ARCH_X86_64
+/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
+#define ALIGN_64_VAL 64
+#define ALIGN_32_VAL 32
+#define ALIGN_16_VAL 16
+#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
+/* ARM doesn't benefit from anything more than 16-byte alignment. */
+#define ALIGN_64_VAL 16
+#define ALIGN_32_VAL 16
+#define ALIGN_16_VAL 16
+#else
+/* No need for extra alignment on platforms without assembly. */
+#define ALIGN_64_VAL 8
+#define ALIGN_32_VAL 8
+#define ALIGN_16_VAL 8
+#endif
+
+/*
+ * API for variables, struct members (ALIGN()) like:
+ * uint8_t var[1][2][3][4]
+ * becomes:
+ * ALIGN(uint8_t var[1][2][3][4], alignment).
+ */
+#ifdef _MSC_VER
+#define ALIGN(ll, a) \
+    __declspec(align(a)) ll
+#else
+#define ALIGN(line, align) \
+    line __attribute__((aligned(align)))
+#endif
+
+/*
+ * API for stack alignment (ALIGN_STK_$align()) of variables like:
+ * uint8_t var[1][2][3][4]
+ * becomes:
+ * ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
+ */
+#define ALIGN_STK_64(type, var, sz1d, sznd) \
+    ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
+#define ALIGN_STK_32(type, var, sz1d, sznd) \
+    ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
+#define ALIGN_STK_16(type, var, sz1d, sznd) \
+    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
+
+/*
+ * Forbid inlining of a function:
+ * static NOINLINE void func() {}
+ */
+#ifdef _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else /* !_MSC_VER */
+#define NOINLINE __attribute__((noinline))
+#endif /* !_MSC_VER */
+
+#ifdef __clang__
+#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
+#else
+#define NO_SANITIZE(x)
+#endif
+
+#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
+#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
+#elif defined(NDEBUG) && defined(_MSC_VER)
+#define assert __assume
+#else
+#include <assert.h>
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#    define dav1d_uninit(x) x=x
+#else
+#    define dav1d_uninit(x) x
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+
+static inline int ctz(const unsigned int mask) {
+    unsigned long idx;
+    _BitScanForward(&idx, mask);
+    return idx;
+}
+
+static inline int clz(const unsigned int mask) {
+    unsigned long leading_zero = 0;
+    _BitScanReverse(&leading_zero, mask);
+    return (31 - leading_zero);
+}
+
+#ifdef _WIN64
+static inline int clzll(const unsigned long long mask) {
+    unsigned long leading_zero = 0;
+    _BitScanReverse64(&leading_zero, mask);
+    return (63 - leading_zero);
+}
+#else /* _WIN64 */
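+// Without _BitScanReverse64, derive the count from the two 32-bit halves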
+static inline int clzll(const unsigned long long mask) {
+    if (mask >> 32)
+        return clz((unsigned)(mask >> 32));
+    else
+        return clz((unsigned)mask) + 32;
+}
+#endif /* _WIN64 */
+#else /* !_MSC_VER */
+static inline int ctz(const unsigned int mask) {
+    return __builtin_ctz(mask);
+}
+
+static inline int clz(const unsigned int mask) {
+    return __builtin_clz(mask);
+}
+
+static inline int clzll(const unsigned long long mask) {
+    return __builtin_clzll(mask);
+}
+#endif /* !_MSC_VER */
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#endif /* DAV1D_COMMON_ATTRIBUTES_H */
diff --git a/include/common/bitdepth.h b/include/common/bitdepth.h
new file mode 100644 (file)
index 0000000..88a822a
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_BITDEPTH_H
+#define DAV1D_COMMON_BITDEPTH_H
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+
+#if !defined(BITDEPTH)
+typedef void pixel;
+typedef void coef;
+#define HIGHBD_DECL_SUFFIX /* nothing */
+#define HIGHBD_CALL_SUFFIX /* nothing */
+#define HIGHBD_TAIL_SUFFIX /* nothing */
+#elif BITDEPTH == 8
+typedef uint8_t pixel;
+typedef int16_t coef;
+#define PIXEL_TYPE uint8_t
+#define COEF_TYPE int16_t
+#define pixel_copy memcpy
+#define pixel_set memset
+#define iclip_pixel iclip_u8
+#define PIX_HEX_FMT "%02x"
+#define bitfn(x) x##_8bpc
+#define BF(x, suffix) x##_8bpc_##suffix
+#define PXSTRIDE(x) (x)
+#define highbd_only(x)
+#define HIGHBD_DECL_SUFFIX /* nothing */
+#define HIGHBD_CALL_SUFFIX /* nothing */
+#define HIGHBD_TAIL_SUFFIX /* nothing */
+#define bitdepth_from_max(x) 8
+#define BITDEPTH_MAX 0xff
+#elif BITDEPTH == 16
+typedef uint16_t pixel;
+typedef int32_t coef;
+#define PIXEL_TYPE uint16_t
+#define COEF_TYPE int32_t
+#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
+static inline void pixel_set(pixel *const dst, const int val, const int num) {
+    for (int n = 0; n < num; n++)
+        dst[n] = val;
+}
+#define PIX_HEX_FMT "%03x"
+#define iclip_pixel(x) iclip(x, 0, bitdepth_max)
+#define HIGHBD_DECL_SUFFIX , const int bitdepth_max
+#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
+#define HIGHBD_TAIL_SUFFIX , bitdepth_max
+#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
+#define BITDEPTH_MAX bitdepth_max
+#define bitfn(x) x##_16bpc
+#define BF(x, suffix) x##_16bpc_##suffix
+static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
+    assert(!(x & 1));
+    return x >> 1;
+}
+#define highbd_only(x) x
+#else
+#error invalid value for bitdepth
+#endif
+#define bytefn(x) bitfn(x)
+
+#define bitfn_decls(name, ...) \
+name##_8bpc(__VA_ARGS__); \
+name##_16bpc(__VA_ARGS__)
+
+#endif /* DAV1D_COMMON_BITDEPTH_H */
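
bitfn() and BF() are the core of dav1d's single-source bitdepth templating: each template translation unit is compiled twice, once with -DBITDEPTH=8 and once with -DBITDEPTH=16, and the macros mangle every function name with a _8bpc/_16bpc suffix so both objects can be linked into the same library. A minimal sketch of the convention (dav1d_pixel_avg is a hypothetical name, not a real dav1d symbol):

    /* avg_tmpl.c: built once per bitdepth by the build system */
    #include "common/bitdepth.h"

    /* bitfn() expands to dav1d_pixel_avg_8bpc() or dav1d_pixel_avg_16bpc() */
    void bitfn(dav1d_pixel_avg)(pixel *const dst, const pixel *const a,
                                const pixel *const b, const int w)
    {
        for (int x = 0; x < w; x++)
            dst[x] = (a[x] + b[x] + 1) >> 1; /* rounding average, either depth */
    }
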
diff --git a/include/common/dump.h b/include/common/dump.h
new file mode 100644 (file)
index 0000000..9ffab6a
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_DUMP_H
+#define DAV1D_COMMON_DUMP_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "common/bitdepth.h"
+
+static inline void append_plane_to_file(const pixel *buf, ptrdiff_t stride,
+                                        int w, int h, const char *const file)
+{
+    FILE *const f = fopen(file, "ab");
+    if (!f) return; /* bail out rather than write through a NULL stream */
+    while (h--) {
+        fwrite(buf, w * sizeof(pixel), 1, f);
+        buf += PXSTRIDE(stride);
+    }
+    fclose(f);
+}
+
+static inline void hex_fdump(FILE *out, const pixel *buf, ptrdiff_t stride,
+                             int w, int h, const char *what)
+{
+    fprintf(out, "%s\n", what);
+    while (h--) {
+        int x;
+        for (x = 0; x < w; x++)
+            fprintf(out, " " PIX_HEX_FMT, buf[x]);
+        buf += PXSTRIDE(stride);
+        fprintf(out, "\n");
+    }
+}
+
+static inline void hex_dump(const pixel *buf, ptrdiff_t stride,
+                            int w, int h, const char *what)
+{
+    hex_fdump(stdout, buf, stride, w, h, what);
+}
+
+static inline void coef_dump(const coef *buf, const int w, const int h,
+                             const int len, const char *what)
+{
+    int y;
+    printf("%s\n", what);
+    for (y = 0; y < h; y++) {
+        int x;
+        for (x = 0; x < w; x++)
+            printf(" %*d", len, buf[x]);
+        buf += w;
+        printf("\n");
+    }
+}
+
+static inline void ac_dump(const int16_t *buf, int w, int h, const char *what)
+{
+    printf("%s\n", what);
+    while (h--) {
+        for (int x = 0; x < w; x++)
+            printf(" %03d", buf[x]);
+        buf += w;
+        printf("\n");
+    }
+}
+
+#endif /* DAV1D_COMMON_DUMP_H */
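
These helpers are debug aids for dumping intermediate pixel and coefficient data; they are only meaningful in bitdepth-templated code, where pixel is a concrete type. A short usage sketch (the path and block size are illustrative):

    #include "common/dump.h"

    static void debug_4x4_block(const pixel *const blk, const ptrdiff_t stride) {
        hex_dump(blk, stride, 4, 4, "predicted 4x4 block");       /* hex to stdout */
        append_plane_to_file(blk, stride, 4, 4, "/tmp/blocks.raw"); /* raw bytes */
    }
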
diff --git a/include/common/intops.h b/include/common/intops.h
new file mode 100644 (file)
index 0000000..2d21998
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_INTOPS_H
+#define DAV1D_COMMON_INTOPS_H
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+static inline int imax(const int a, const int b) {
+    return a > b ? a : b;
+}
+
+static inline int imin(const int a, const int b) {
+    return a < b ? a : b;
+}
+
+static inline unsigned umax(const unsigned a, const unsigned b) {
+    return a > b ? a : b;
+}
+
+static inline unsigned umin(const unsigned a, const unsigned b) {
+    return a < b ? a : b;
+}
+
+static inline int iclip(const int v, const int min, const int max) {
+    return v < min ? min : v > max ? max : v;
+}
+
+static inline int iclip_u8(const int v) {
+    return iclip(v, 0, 255);
+}
+
+static inline int apply_sign(const int v, const int s) {
+    return s < 0 ? -v : v;
+}
+
+static inline int apply_sign64(const int v, const int64_t s) {
+    return s < 0 ? -v : v;
+}
+
+static inline int ulog2(const unsigned v) {
+    return 31 - clz(v);
+}
+
+static inline int u64log2(const uint64_t v) {
+    return 63 - clzll(v);
+}
+
+static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
+    if (v > (r << 1))
+        return v;
+    else if ((v & 1) == 0)
+        return (v >> 1) + r;
+    else
+        return r - ((v + 1) >> 1);
+}
+
+#endif /* DAV1D_COMMON_INTOPS_H */
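
A few worked values for the less obvious helpers: ulog2() is floor(log2(v)) computed via clz(), and inv_recenter() undoes the recentering applied when a value is coded as a distance from a reference r (zero inputs are undefined wherever clz()/clzll() is involved):

    #include <assert.h>

    #include "common/intops.h"

    static void intops_examples(void) {
        assert(iclip(300, 0, 255) == 255);   /* clamped from above */
        assert(iclip_u8(-5) == 0);           /* clamped from below */
        assert(ulog2(32) == 5);              /* floor(log2(32)) */
        assert(u64log2(1ULL << 40) == 40);   /* 64-bit variant */
        assert(inv_recenter(10, 25) == 25);  /* v > 2*r: passed through */
        assert(inv_recenter(10, 4) == 12);   /* even v: r + v/2 */
        assert(inv_recenter(10, 5) == 7);    /* odd v: r - (v+1)/2 */
    }
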
diff --git a/include/common/mem.h b/include/common/mem.h
new file mode 100644 (file)
index 0000000..74cdaf2
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_MEM_H
+#define DAV1D_COMMON_MEM_H
+
+#include <stdlib.h>
+
+#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
+#include <malloc.h>
+#endif
+
+#include "common/attributes.h"
+
+/*
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
+ */
+static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
+    assert(!(align & (align - 1)));
+#ifdef HAVE_POSIX_MEMALIGN
+    void *ptr;
+    if (posix_memalign(&ptr, align, sz)) return NULL;
+    return ptr;
+#elif defined(HAVE_ALIGNED_MALLOC)
+    return _aligned_malloc(sz, align);
+#elif defined(HAVE_MEMALIGN)
+    return memalign(align, sz);
+#else
+#error Missing aligned alloc implementation
+#endif
+}
+
+static inline void dav1d_free_aligned(void* ptr) {
+#ifdef HAVE_POSIX_MEMALIGN
+    free(ptr);
+#elif defined(HAVE_ALIGNED_MALLOC)
+    _aligned_free(ptr);
+#elif defined(HAVE_MEMALIGN)
+    free(ptr);
+#endif
+}
+
+static inline void dav1d_freep_aligned(void* ptr) {
+    void **mem = (void **) ptr;
+    if (*mem) {
+        dav1d_free_aligned(*mem);
+        *mem = NULL;
+    }
+}
+
+static inline void freep(void *ptr) {
+    void **mem = (void **) ptr;
+    if (*mem) {
+        free(*mem);
+        *mem = NULL;
+    }
+}
+
+#endif /* DAV1D_COMMON_MEM_H */
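
A usage sketch for the allocation helpers: the alignment must be a power of two (checked by the assert), the macro-selected backend pairs each allocation with the matching free, and the *p variant also NULLs the pointer:

    #include <stdint.h>

    #include "common/mem.h"

    static void mem_example(void) {
        /* 64-byte-aligned scratch buffer */
        uint8_t *buf = dav1d_alloc_aligned(4096, 64);
        if (!buf) return;          /* allocation failure yields NULL */
        /* ... use buf ... */
        dav1d_freep_aligned(&buf); /* frees and sets buf to NULL */
    }
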
diff --git a/include/common/validate.h b/include/common/validate.h
new file mode 100644 (file)
index 0000000..3096f3d
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_VALIDATE_H
+#define DAV1D_COMMON_VALIDATE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(NDEBUG)
+#define debug_abort()
+#else
+#define debug_abort abort
+#endif
+
+#define validate_input_or_ret_with_msg(x, r, ...) \
+    if (!(x)) { \
+        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
+                #x, __func__); \
+        fprintf(stderr, __VA_ARGS__); \
+        debug_abort(); \
+        return r; \
+    }
+
+#define validate_input_or_ret(x, r) \
+    if (!(x)) { \
+        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
+                #x, __func__); \
+        debug_abort(); \
+        return r; \
+    }
+
+#define validate_input(x) validate_input_or_ret(x, )
+
+#endif /* DAV1D_COMMON_VALIDATE_H */
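
The intended pattern: a public entry point validates its arguments up front, logs the failed check (and, in debug builds where NDEBUG is unset, aborts), and otherwise returns an error value. A hypothetical entry point as a sketch:

    #include "common/validate.h"

    int example_api_call(const void *const ptr, const int n) {
        validate_input_or_ret(ptr != NULL, -1);
        validate_input_or_ret_with_msg(n > 0, -1,
                                       "n (%d) must be positive\n", n);
        /* ... real work ... */
        return 0;
    }
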
diff --git a/include/compat/gcc/stdatomic.h b/include/compat/gcc/stdatomic.h
new file mode 100644 (file)
index 0000000..6d16a2c
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+* Copyright © 2018, VideoLAN and dav1d authors
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice, this
+*    list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+*    this list of conditions and the following disclaimer in the documentation
+*    and/or other materials provided with the distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef GCCVER_STDATOMIC_H_
+#define GCCVER_STDATOMIC_H_
+
+#if !defined(__cplusplus)
+
+typedef int atomic_int;
+typedef unsigned int atomic_uint;
+
+#define memory_order_relaxed __ATOMIC_RELAXED
+#define memory_order_acquire __ATOMIC_ACQUIRE
+
+#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
+#define atomic_store(p_a, v)          __atomic_store_n(p_a, v, __ATOMIC_SEQ_CST)
+#define atomic_load(p_a)              __atomic_load_n(p_a, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo)
+#define atomic_fetch_add(p_a, inc)    __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(p_a, dec)    __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
+
+#endif /* !defined(__cplusplus) */
+
+#endif /* GCCVER_STDATOMIC_H_ */
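
The shim only has to cover the small subset of C11 atomics that dav1d uses, mapped onto the GCC/Clang __atomic builtins. A reference-counting sketch using exactly that subset:

    #include "compat/gcc/stdatomic.h"

    static atomic_int refcount;

    static void ref_init(void) { atomic_init(&refcount, 1); }
    static void ref_inc(void)  { atomic_fetch_add(&refcount, 1); }
    /* atomic_fetch_sub() returns the old value, so 1 means last reference */
    static int  ref_dec(void)  { return atomic_fetch_sub(&refcount, 1) == 1; }
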
diff --git a/include/compat/getopt.h b/include/compat/getopt.h
new file mode 100644 (file)
index 0000000..930e002
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef __GETOPT_H__
+/**
+ * DISCLAIMER
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ *
+ * The mingw-w64 runtime package and its code is distributed in the hope that it 
+ * will be useful but WITHOUT ANY WARRANTY.  ALL WARRANTIES, EXPRESSED OR 
+ * IMPLIED ARE HEREBY DISCLAIMED.  This includes but is not limited to 
+ * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#define __GETOPT_H__
+
+/* All the headers include this file. */
+#include <crtdefs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int optind;             /* index of first non-option in argv      */
+extern int optopt;             /* single option character, as parsed     */
+extern int opterr;             /* flag to enable built-in diagnostics... */
+                               /* (user may set to zero, to suppress)    */
+
+extern char *optarg;           /* pointer to argument of current option  */
+
+extern int getopt(int nargc, char * const *nargv, const char *options);
+
+#ifdef _BSD_SOURCE
+/*
+ * BSD adds the non-standard `optreset' feature, for reinitialisation
+ * of `getopt' parsing.  We support this feature, for applications which
+ * proclaim their BSD heritage, before including this header; however,
+ * to maintain portability, developers are advised to avoid it.
+ */
+# define optreset  __mingw_optreset
+extern int optreset;
+#endif
+#ifdef __cplusplus
+}
+#endif
+/*
+ * POSIX requires the `getopt' API to be specified in `unistd.h';
+ * thus, `unistd.h' includes this header.  However, we do not want
+ * to expose the `getopt_long' or `getopt_long_only' APIs, when
+ * included in this manner.  Thus, close the standard __GETOPT_H__
+ * declarations block, and open an additional __GETOPT_LONG_H__
+ * specific block, only when *not* __UNISTD_H_SOURCED__, in which
+ * to declare the extended API.
+ */
+#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__)
+#define __GETOPT_LONG_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct option          /* specification for a long form option...      */
+{
+  const char *name;            /* option name, without leading hyphens */
+  int         has_arg;         /* does it take an argument?            */
+  int        *flag;            /* where to save its status, or NULL    */
+  int         val;             /* its associated status value          */
+};
+
+enum                   /* permitted values for its `has_arg' field...  */
+{
+  no_argument = 0,             /* option never takes an argument       */
+  required_argument,           /* option always requires an argument   */
+  optional_argument            /* option may take an argument          */
+};
+
+extern int getopt_long(int nargc, char * const *nargv, const char *options,
+    const struct option *long_options, int *idx);
+extern int getopt_long_only(int nargc, char * const *nargv, const char *options,
+    const struct option *long_options, int *idx);
+/*
+ * Previous MinGW implementation had...
+ */
+#ifndef HAVE_DECL_GETOPT
+/*
+ * ...for the long form API only; keep this for compatibility.
+ */
+# define HAVE_DECL_GETOPT      1
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */
+
+#endif /* !defined(__GETOPT_H__) */
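
A generic usage sketch for the long-option API this compat header declares for the MinGW build (the option table is illustrative):

    #include <stdio.h>

    #include "compat/getopt.h"

    static const struct option long_opts[] = {
        { "input",   required_argument, NULL, 'i' },
        { "verbose", no_argument,       NULL, 'v' },
        { NULL, 0, NULL, 0 }, /* table terminator */
    };

    static int parse_args(const int argc, char *const argv[]) {
        int c;
        while ((c = getopt_long(argc, argv, "i:v", long_opts, NULL)) != -1) {
            switch (c) {
            case 'i': printf("input file: %s\n", optarg); break;
            case 'v': printf("verbose on\n");             break;
            default:  return -1; /* getopt printed a diagnostic (opterr != 0) */
            }
        }
        return 0;
    }
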
diff --git a/include/compat/msvc/stdatomic.h b/include/compat/msvc/stdatomic.h
new file mode 100644 (file)
index 0000000..979ee2b
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+* Copyright © 2018, VideoLAN and dav1d authors
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice, this
+*    list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+*    this list of conditions and the following disclaimer in the documentation
+*    and/or other materials provided with the distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef MSCVER_STDATOMIC_H_
+#define MSCVER_STDATOMIC_H_
+
+#if !defined(__cplusplus) && defined(_MSC_VER)
+
+#pragma warning(push)
+#pragma warning(disable:4067)    /* newline for __has_include_next */
+
+#if defined(__clang__) && __has_include_next(<stdatomic.h>)
+   /* use the clang stdatomic.h with clang-cl */
+#  include_next <stdatomic.h>
+#else /* ! stdatomic.h */
+
+#include <windows.h>
+
+#include "common/attributes.h"
+
+typedef volatile LONG  __declspec(align(32)) atomic_int;
+typedef volatile ULONG __declspec(align(32)) atomic_uint;
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_acquire
+} msvc_atomic_memory_order;
+
+#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
+#define atomic_store(p_a, v)          InterlockedExchange((LONG*)p_a, v)
+#define atomic_load(p_a)              InterlockedCompareExchange((LONG*)p_a, 0, 0)
+#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
+
+/*
+ * TODO use a special call to increment/decrement
+ * using InterlockedIncrement/InterlockedDecrement
+ */
+#define atomic_fetch_add(p_a, inc)    InterlockedExchangeAdd(p_a, inc)
+#define atomic_fetch_sub(p_a, dec)    InterlockedExchangeAdd(p_a, -(dec))
+
+#endif /* ! stdatomic.h */
+
+#pragma warning(pop)
+
+#endif /* !defined(__cplusplus) && defined(_MSC_VER) */
+
+#endif /* MSCVER_STDATOMIC_H_ */
diff --git a/include/dav1d/common.h b/include/dav1d/common.h
new file mode 100644 (file)
index 0000000..b55e939
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_COMMON_H
+#define DAV1D_COMMON_H
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef DAV1D_API
+    #if defined _WIN32
+      #if defined DAV1D_BUILDING_DLL
+        #define DAV1D_API __declspec(dllexport)
+      #else
+        #define DAV1D_API
+      #endif
+    #else
+      #if __GNUC__ >= 4
+        #define DAV1D_API __attribute__ ((visibility ("default")))
+      #else
+        #define DAV1D_API
+      #endif
+    #endif
+#endif
+
+#if EPERM > 0
+#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
+#else
+#define DAV1D_ERR(e) (e)
+#endif
+
+/**
+ * A reference-counted object wrapper for a user-configurable pointer.
+ */
+typedef struct Dav1dUserData {
+    const uint8_t *data; ///< data pointer
+    struct Dav1dRef *ref; ///< allocation origin
+} Dav1dUserData;
+
+/**
+ * Input packet metadata, copied from the input data used to decode
+ * each image into the matching structure of the output image returned
+ * to the user. Since these are metadata fields, they can be used for
+ * purposes other than the documented ones; they will still be passed
+ * from input data to output picture without being used internally.
+ */
+typedef struct Dav1dDataProps {
+    int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default)
+    int64_t duration; ///< container duration of input data, 0 if unknown (default)
+    int64_t offset; ///< stream offset of input data, -1 if unknown (default)
+    size_t size; ///< packet size, default Dav1dData.sz
+    struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
+} Dav1dDataProps;
+
+#endif /* DAV1D_COMMON_H */
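
DAV1D_ERR() exists so the public API can uniformly return negative error codes even on platforms whose POSIX codes are already negative. A sketch of the resulting convention on the caller's side:

    #include <errno.h>
    #include <stdio.h>

    #include "dav1d/common.h"

    static void report(const int res) {
        if (res == 0)
            puts("ok");
        else if (res == DAV1D_ERR(EAGAIN))
            puts("decoder needs more input, or output must be drained first");
        else if (res == DAV1D_ERR(ENOMEM))
            puts("allocation failure");
        else
            printf("error %d\n", res);
    }
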
diff --git a/include/dav1d/data.h b/include/dav1d/data.h
new file mode 100644 (file)
index 0000000..f945a04
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_DATA_H
+#define DAV1D_DATA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common.h"
+
+typedef struct Dav1dData {
+    const uint8_t *data; ///< data pointer
+    size_t sz; ///< data size
+    struct Dav1dRef *ref; ///< allocation origin
+    Dav1dDataProps m; ///< user provided metadata passed to the output picture
+} Dav1dData;
+
+/**
+ * Allocate data.
+ *
+ * @param data Input context.
+ * @param   sz Size of the data that should be allocated.
+ *
+ * @return Pointer to the allocated buffer on success. NULL on error.
+ */
+DAV1D_API uint8_t * dav1d_data_create(Dav1dData *data, size_t sz);
+
+/**
+ * Wrap an existing data array.
+ *
+ * @param          data Input context.
+ * @param           buf The data to be wrapped.
+ * @param            sz Size of the data.
+ * @param free_callback Function to be called when we release our last
+ *                      reference to this data. In this callback, $buf will be
+ *                      the $buf argument to this function, and $cookie will
+ *                      be the $cookie input argument to this function.
+ * @param        cookie Opaque parameter passed to free_callback().
+ *
+ * @return 0 on success. A negative DAV1D_ERR value on error.
+ */
+DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
+                              void (*free_callback)(const uint8_t *buf, void *cookie),
+                              void *cookie);
+
+/**
+ * Wrap a user-provided data pointer into a reference counted object.
+ *
+ * The data->m.user_data field will be initialized to wrap the provided
+ * $user_data pointer.
+ *
+ * $free_callback will be called on the same thread that released the last
+ * reference. If frame threading is used, make sure $free_callback is
+ * thread-safe.
+ *
+ * @param          data Input context.
+ * @param     user_data The user data to be wrapped.
+ * @param free_callback Function to be called when we release our last
+ *                      reference to this data. In this callback, $user_data
+ *                      will be the $user_data argument to this function, and
+ *                      $cookie will be the $cookie input argument to this
+ *                      function.
+ * @param        cookie Opaque parameter passed to $free_callback.
+ *
+ * @return 0 on success. A negative DAV1D_ERR value on error.
+ */
+DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data,
+                                        const uint8_t *user_data,
+                                        void (*free_callback)(const uint8_t *user_data,
+                                                              void *cookie),
+                                        void *cookie);
+
+/**
+ * Free the data reference.
+ *
+ * The reference count for data->m.user_data will be decremented (if it has been
+ * initialized with dav1d_data_wrap_user_data). The $data object will be memset
+ * to 0.
+ *
+ * @param data Input context.
+ */
+DAV1D_API void dav1d_data_unref(Dav1dData *data);
+
+#endif /* DAV1D_DATA_H */
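
dav1d_data_wrap() hands an existing buffer to the decoder with no copy; the callback fires exactly once, when the last reference is dropped. A minimal sketch (free_buf and wrap_packet are illustrative names):

    #include <stdlib.h>

    #include "dav1d/data.h"

    static void free_buf(const uint8_t *const buf, void *const cookie) {
        (void)cookie;
        free((void *)buf); /* called once, on release of the last reference */
    }

    /* Wrap a malloc'ed packet; on success the library owns the buffer. */
    static int wrap_packet(Dav1dData *const data, uint8_t *const buf,
                           const size_t sz)
    {
        return dav1d_data_wrap(data, buf, sz, free_buf, NULL);
    }
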
diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h
new file mode 100644 (file)
index 0000000..32fe8c3
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_H
+#define DAV1D_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <errno.h>
+#include <stdarg.h>
+
+#include "common.h"
+#include "picture.h"
+#include "data.h"
+#include "version.h"
+
+typedef struct Dav1dContext Dav1dContext;
+typedef struct Dav1dRef Dav1dRef;
+
+#define DAV1D_MAX_FRAME_THREADS 256
+#define DAV1D_MAX_TILE_THREADS 64
+
+typedef struct Dav1dLogger {
+    void *cookie; ///< Custom data to pass to the callback.
+    /**
+     * Logger callback. May be NULL to disable logging.
+     *
+     * @param cookie Custom pointer passed to all calls.
+     * @param format The vprintf compatible format string.
+     * @param     ap List of arguments referenced by the format string.
+     */
+    void (*callback)(void *cookie, const char *format, va_list ap);
+} Dav1dLogger;
+
+typedef struct Dav1dSettings {
+    int n_frame_threads;
+    int n_tile_threads;
+    int apply_grain;
+    int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
+    int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
+    unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
+    uint8_t reserved[32]; ///< reserved for future use
+    Dav1dPicAllocator allocator; ///< Picture allocator callback.
+    Dav1dLogger logger; ///< Logger callback.
+} Dav1dSettings;
+
+/**
+ * Get library version.
+ */
+DAV1D_API const char *dav1d_version(void);
+
+/**
+ * Initialize settings to default values.
+ *
+ * @param s Input settings context.
+ */
+DAV1D_API void dav1d_default_settings(Dav1dSettings *s);
+
+/**
+ * Allocate and open a decoder instance.
+ *
+ * @param c_out The decoder instance to open. *c_out will be set to the
+ *              allocated context.
+ * @param     s Input settings context.
+ *
+ * @note The context must be freed using dav1d_close() when decoding is
+ *       finished.
+ *
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ */
+DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
+
+/**
+ * Parse a Sequence Header OBU from bitstream data.
+ *
+ * @param out Output Sequence Header.
+ * @param buf The data to be parsed.
+ * @param sz  Size of the data.
+ *
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ *
+ * @note It is safe to feed this function data containing OBUs other than a
+ *       Sequence Header, as they will simply be ignored. If there is more than
+ *       one Sequence Header OBU present, only the last will be returned.
+ */
+DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
+                                          const uint8_t *buf, const size_t sz);
+
+/**
+ * Feed bitstream data to the decoder.
+ *
+ * @param   c Input decoder instance.
+ * @param  in Input bitstream data. On success, ownership of the reference is
+ *            passed to the library.
+ *
+ * @return
+ *         0: Success, and the data was consumed.
+ *  DAV1D_ERR(EAGAIN): The data can't be consumed. dav1d_get_picture() should
+ *                     be called to get one or more frames before the function
+ *                     can consume new data.
+ *  other negative DAV1D_ERR codes: Error during decoding or because of invalid
+ *                                  passed-in arguments.
+ */
+DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
+
+/**
+ * Return a decoded picture.
+ *
+ * @param   c Input decoder instance.
+ * @param out Output frame. The caller assumes ownership of the returned
+ *            reference.
+ *
+ * @return
+ *         0: Success, and a frame is returned.
+ *  DAV1D_ERR(EAGAIN): Not enough data to output a frame. dav1d_send_data()
+ *                     should be called with new input.
+ *  other negative DAV1D_ERR codes: Error during decoding or because of invalid
+ *                                  passed-in arguments.
+ *
+ * @note To drain buffered frames from the decoder (i.e. on end of stream),
+ *       call this function until it returns DAV1D_ERR(EAGAIN).
+ *
+ * @code{.c}
+ *  Dav1dData data = { 0 };
+ *  Dav1dPicture p = { 0 };
+ *  int res;
+ *
+ *  read_data(&data);
+ *  do {
+ *      res = dav1d_send_data(c, &data);
+ *      // Keep going even if the function can't consume the current data
+ *      // packet. It eventually will after one or more frames have been
+ *      // returned in this loop.
+ *      if (res < 0 && res != DAV1D_ERR(EAGAIN))
+ *          free_and_abort();
+ *      res = dav1d_get_picture(c, &p);
+ *      if (res < 0) {
+ *          if (res != DAV1D_ERR(EAGAIN))
+ *              free_and_abort();
+ *      } else
+ *          output_and_unref_picture(&p);
+ *  // Stay in the loop as long as there's data to consume.
+ *  } while (data.sz || read_data(&data) == SUCCESS);
+ *
+ *  // Handle EOS by draining all buffered frames.
+ *  do {
+ *      res = dav1d_get_picture(c, &p);
+ *      if (res < 0) {
+ *          if (res != DAV1D_ERR(EAGAIN))
+ *              free_and_abort();
+ *      } else
+ *          output_and_unref_picture(&p);
+ *  } while (res == 0);
+ * @endcode
+ */
+DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
+
+/**
+ * Close a decoder instance and free all associated memory.
+ *
+ * @param c_out The decoder instance to close. *c_out will be set to NULL.
+ */
+DAV1D_API void dav1d_close(Dav1dContext **c_out);
+
+/**
+ * Flush all delayed frames in the decoder and clear internal decoder state,
+ * to be used when seeking.
+ *
+ * @param c Input decoder instance.
+ *
+ * @note Decoding will start only after a valid sequence header OBU is
+ *       delivered to dav1d_send_data().
+ */
+DAV1D_API void dav1d_flush(Dav1dContext *c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DAV1D_H */
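
Complementing the decode loop in the example above, a sketch of the surrounding context lifecycle; settings should always be initialized with dav1d_default_settings() before individual fields are overridden:

    #include "dav1d/dav1d.h"

    static int decoder_lifecycle(void) {
        Dav1dSettings s;
        Dav1dContext *c = NULL;

        dav1d_default_settings(&s); /* start from the defaults */
        s.n_frame_threads = 2;      /* then override selectively */
        if (dav1d_open(&c, &s) < 0)
            return -1;
        /* ... dav1d_send_data()/dav1d_get_picture() loop as shown above ... */
        dav1d_close(&c);            /* frees everything; sets c to NULL */
        return 0;
    }
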
diff --git a/include/dav1d/headers.h b/include/dav1d/headers.h
new file mode 100644 (file)
index 0000000..d7e8a2b
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_HEADERS_H
+#define DAV1D_HEADERS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Constants from Section 3. "Symbols and abbreviated terms"
+#define DAV1D_MAX_CDEF_STRENGTHS 8
+#define DAV1D_MAX_OPERATING_POINTS 32
+#define DAV1D_MAX_TILE_COLS 64
+#define DAV1D_MAX_TILE_ROWS 64
+#define DAV1D_MAX_SEGMENTS 8
+#define DAV1D_NUM_REF_FRAMES 8
+#define DAV1D_PRIMARY_REF_NONE 7
+#define DAV1D_REFS_PER_FRAME 7
+#define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)
+
+enum Dav1dObuType {
+    DAV1D_OBU_SEQ_HDR   = 1,
+    DAV1D_OBU_TD        = 2,
+    DAV1D_OBU_FRAME_HDR = 3,
+    DAV1D_OBU_TILE_GRP  = 4,
+    DAV1D_OBU_METADATA  = 5,
+    DAV1D_OBU_FRAME     = 6,
+    DAV1D_OBU_REDUNDANT_FRAME_HDR = 7,
+    DAV1D_OBU_PADDING   = 15,
+};
+
+enum Dav1dTxfmMode {
+    DAV1D_TX_4X4_ONLY,
+    DAV1D_TX_LARGEST,
+    DAV1D_TX_SWITCHABLE,
+    DAV1D_N_TX_MODES,
+};
+
+enum Dav1dFilterMode {
+    DAV1D_FILTER_8TAP_REGULAR,
+    DAV1D_FILTER_8TAP_SMOOTH,
+    DAV1D_FILTER_8TAP_SHARP,
+    DAV1D_N_SWITCHABLE_FILTERS,
+    DAV1D_FILTER_BILINEAR = DAV1D_N_SWITCHABLE_FILTERS,
+    DAV1D_N_FILTERS,
+    DAV1D_FILTER_SWITCHABLE = DAV1D_N_FILTERS,
+};
+
+enum Dav1dAdaptiveBoolean {
+    DAV1D_OFF = 0,
+    DAV1D_ON = 1,
+    DAV1D_ADAPTIVE = 2,
+};
+
+enum Dav1dRestorationType {
+    DAV1D_RESTORATION_NONE,
+    DAV1D_RESTORATION_SWITCHABLE,
+    DAV1D_RESTORATION_WIENER,
+    DAV1D_RESTORATION_SGRPROJ,
+};
+
+enum Dav1dWarpedMotionType {
+    DAV1D_WM_TYPE_IDENTITY,
+    DAV1D_WM_TYPE_TRANSLATION,
+    DAV1D_WM_TYPE_ROT_ZOOM,
+    DAV1D_WM_TYPE_AFFINE,
+};
+
+typedef struct Dav1dWarpedMotionParams {
+    enum Dav1dWarpedMotionType type;
+    int32_t matrix[6];
+    union {
+        struct {
+            int16_t alpha, beta, gamma, delta;
+        };
+        int16_t abcd[4];
+    };
+} Dav1dWarpedMotionParams;
+
+enum Dav1dPixelLayout {
+    DAV1D_PIXEL_LAYOUT_I400, ///< monochrome
+    DAV1D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar
+    DAV1D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar
+    DAV1D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar
+};
+
+enum Dav1dFrameType {
+    DAV1D_FRAME_TYPE_KEY = 0,    ///< Key Intra frame
+    DAV1D_FRAME_TYPE_INTER = 1,  ///< Inter frame
+    DAV1D_FRAME_TYPE_INTRA = 2,  ///< Non key Intra frame
+    DAV1D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame
+};
+
+enum Dav1dColorPrimaries {
+    DAV1D_COLOR_PRI_BT709 = 1,
+    DAV1D_COLOR_PRI_UNKNOWN = 2,
+    DAV1D_COLOR_PRI_BT470M = 4,
+    DAV1D_COLOR_PRI_BT470BG = 5,
+    DAV1D_COLOR_PRI_BT601 = 6,
+    DAV1D_COLOR_PRI_SMPTE240 = 7,
+    DAV1D_COLOR_PRI_FILM = 8,
+    DAV1D_COLOR_PRI_BT2020 = 9,
+    DAV1D_COLOR_PRI_XYZ = 10,
+    DAV1D_COLOR_PRI_SMPTE431 = 11,
+    DAV1D_COLOR_PRI_SMPTE432 = 12,
+    DAV1D_COLOR_PRI_EBU3213 = 22,
+};
+
+enum Dav1dTransferCharacteristics {
+    DAV1D_TRC_BT709 = 1,
+    DAV1D_TRC_UNKNOWN = 2,
+    DAV1D_TRC_BT470M = 4,
+    DAV1D_TRC_BT470BG = 5,
+    DAV1D_TRC_BT601 = 6,
+    DAV1D_TRC_SMPTE240 = 7,
+    DAV1D_TRC_LINEAR = 8,
+    DAV1D_TRC_LOG100 = 9,         ///< logarithmic (100:1 range)
+    DAV1D_TRC_LOG100_SQRT10 = 10, ///< logarithmic (100*sqrt(10):1 range)
+    DAV1D_TRC_IEC61966 = 11,
+    DAV1D_TRC_BT1361 = 12,
+    DAV1D_TRC_SRGB = 13,
+    DAV1D_TRC_BT2020_10BIT = 14,
+    DAV1D_TRC_BT2020_12BIT = 15,
+    DAV1D_TRC_SMPTE2084 = 16,     ///< PQ
+    DAV1D_TRC_SMPTE428 = 17,
+    DAV1D_TRC_HLG = 18,           ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
+};
+
+enum Dav1dMatrixCoefficients {
+    DAV1D_MC_IDENTITY = 0,
+    DAV1D_MC_BT709 = 1,
+    DAV1D_MC_UNKNOWN = 2,
+    DAV1D_MC_FCC = 4,
+    DAV1D_MC_BT470BG = 5,
+    DAV1D_MC_BT601 = 6,
+    DAV1D_MC_SMPTE240 = 7,
+    DAV1D_MC_SMPTE_YCGCO = 8,
+    DAV1D_MC_BT2020_NCL = 9,
+    DAV1D_MC_BT2020_CL = 10,
+    DAV1D_MC_SMPTE2085 = 11,
+    DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
+    DAV1D_MC_CHROMAT_CL = 13,
+    DAV1D_MC_ICTCP = 14,
+};
+
+enum Dav1dChromaSamplePosition {
+    DAV1D_CHR_UNKNOWN = 0,
+    DAV1D_CHR_VERTICAL = 1,  ///< Horizontally co-located with luma(0, 0)
+                             ///< sample, between two vertical samples
+    DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
+};
+
+typedef struct Dav1dContentLightLevel {
+    int max_content_light_level;
+    int max_frame_average_light_level;
+} Dav1dContentLightLevel;
+
+typedef struct Dav1dMasteringDisplay {
+    /// 0.16 fixed point
+    uint16_t primaries[3][2];
+    /// 0.16 fixed point
+    uint16_t white_point[2];
+    /// 24.8 fixed point
+    uint32_t max_luminance;
+    /// 18.14 fixed point
+    uint32_t min_luminance;
+} Dav1dMasteringDisplay;
+
+typedef struct Dav1dITUTT35 {
+    uint8_t  country_code;
+    uint8_t  country_code_extension_byte;
+    size_t   payload_size;
+    uint8_t *payload;
+} Dav1dITUTT35;
+
+typedef struct Dav1dSequenceHeader {
+    /**
+     * Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
+     * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
+     * or 12 bits/component at any chroma subsampling.
+     */
+    int profile;
+    /**
+     * Maximum dimensions for this stream. In non-scalable streams, these
+     * are often the actual dimensions of the stream, although that is not
+     * a normative requirement.
+     */
+    int max_width, max_height;
+    enum Dav1dPixelLayout layout; ///< format of the picture
+    enum Dav1dColorPrimaries pri; ///< color primaries (av1)
+    enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
+    enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
+    enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
+    /**
+     * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not
+     * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes
+     * between 8 (0) and 10-12 (1) bits/component, and another element
+     * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
+     * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
+     */
+    int hbd;
+    /**
+     * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
+     * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
+     */
+    int color_range;
+
+    int num_operating_points;
+    struct Dav1dSequenceHeaderOperatingPoint {
+        int major_level, minor_level;
+        int initial_display_delay;
+        int idc;
+        int tier;
+        int decoder_model_param_present;
+        int display_model_param_present;
+    } operating_points[DAV1D_MAX_OPERATING_POINTS];
+
+    int still_picture;
+    int reduced_still_picture_header;
+    int timing_info_present;
+    int num_units_in_tick;
+    int time_scale;
+    int equal_picture_interval;
+    unsigned num_ticks_per_picture;
+    int decoder_model_info_present;
+    int encoder_decoder_buffer_delay_length;
+    int num_units_in_decoding_tick;
+    int buffer_removal_delay_length;
+    int frame_presentation_delay_length;
+    int display_model_info_present;
+    int width_n_bits, height_n_bits;
+    int frame_id_numbers_present;
+    int delta_frame_id_n_bits;
+    int frame_id_n_bits;
+    int sb128;
+    int filter_intra;
+    int intra_edge_filter;
+    int inter_intra;
+    int masked_compound;
+    int warped_motion;
+    int dual_filter;
+    int order_hint;
+    int jnt_comp;
+    int ref_frame_mvs;
+    enum Dav1dAdaptiveBoolean screen_content_tools;
+    enum Dav1dAdaptiveBoolean force_integer_mv;
+    int order_hint_n_bits;
+    int super_res;
+    int cdef;
+    int restoration;
+    int ss_hor, ss_ver, monochrome;
+    int color_description_present;
+    int separate_uv_delta_q;
+    int film_grain_present;
+
+    // Dav1dSequenceHeaders of the same sequence are required to be
+    // bit-identical until this offset. See 7.5 "Ordering of OBUs":
+    //   Within a particular coded video sequence, the contents of
+    //   sequence_header_obu must be bit-identical each time the
+    //   sequence header appears except for the contents of
+    //   operating_parameters_info.
+    struct Dav1dSequenceHeaderOperatingParameterInfo {
+        int decoder_buffer_delay;
+        int encoder_buffer_delay;
+        int low_delay_mode;
+    } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS];
+} Dav1dSequenceHeader;
+
+typedef struct Dav1dSegmentationData {
+    int delta_q;
+    int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
+    int ref;
+    int skip;
+    int globalmv;
+} Dav1dSegmentationData;
+
+typedef struct Dav1dSegmentationDataSet {
+    Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
+    int preskip;
+    int last_active_segid;
+} Dav1dSegmentationDataSet;
+
+typedef struct Dav1dLoopfilterModeRefDeltas {
+    int mode_delta[2 /* is_zeromv */];
+    int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
+} Dav1dLoopfilterModeRefDeltas;
+
+typedef struct Dav1dFilmGrainData {
+    unsigned seed;
+    int num_y_points;
+    uint8_t y_points[14][2 /* value, scaling */];
+    int chroma_scaling_from_luma;
+    int num_uv_points[2];
+    uint8_t uv_points[2][10][2 /* value, scaling */];
+    int scaling_shift;
+    int ar_coeff_lag;
+    int8_t ar_coeffs_y[24];
+    int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
+    uint64_t ar_coeff_shift;
+    int grain_scale_shift;
+    int uv_mult[2];
+    int uv_luma_mult[2];
+    int uv_offset[2];
+    int overlap_flag;
+    int clip_to_restricted_range;
+} Dav1dFilmGrainData;
+
+typedef struct Dav1dFrameHeader {
+    struct {
+        Dav1dFilmGrainData data;
+        int present, update;
+    } film_grain; ///< film grain parameters
+    enum Dav1dFrameType frame_type; ///< type of the picture
+    int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
+    int frame_offset; ///< frame number
+    int temporal_id; ///< temporal id of the frame for SVC
+    int spatial_id; ///< spatial id of the frame for SVC
+
+    int show_existing_frame;
+    int existing_frame_idx;
+    int frame_id;
+    int frame_presentation_delay;
+    int show_frame;
+    int showable_frame;
+    int error_resilient_mode;
+    int disable_cdf_update;
+    int allow_screen_content_tools;
+    int force_integer_mv;
+    int frame_size_override;
+    int primary_ref_frame;
+    int buffer_removal_time_present;
+    struct Dav1dFrameHeaderOperatingPoint {
+        int buffer_removal_time;
+    } operating_points[DAV1D_MAX_OPERATING_POINTS];
+    int refresh_frame_flags;
+    int render_width, render_height;
+    struct {
+        int width_scale_denominator;
+        int enabled;
+    } super_res;
+    int have_render_size;
+    int allow_intrabc;
+    int frame_ref_short_signaling;
+    int refidx[DAV1D_REFS_PER_FRAME];
+    int hp;
+    enum Dav1dFilterMode subpel_filter_mode;
+    int switchable_motion_mode;
+    int use_ref_frame_mvs;
+    int refresh_context;
+    struct {
+        int uniform;
+        unsigned n_bytes;
+        int min_log2_cols, max_log2_cols, log2_cols, cols;
+        int min_log2_rows, max_log2_rows, log2_rows, rows;
+        uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
+        uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
+        int update;
+    } tiling;
+    struct {
+        int yac;
+        int ydc_delta;
+        int udc_delta, uac_delta, vdc_delta, vac_delta;
+        int qm, qm_y, qm_u, qm_v;
+    } quant;
+    struct {
+        int enabled, update_map, temporal, update_data;
+        Dav1dSegmentationDataSet seg_data;
+        int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
+    } segmentation;
+    struct {
+        struct {
+            int present;
+            int res_log2;
+        } q;
+        struct {
+            int present;
+            int res_log2;
+            int multi;
+        } lf;
+    } delta;
+    int all_lossless;
+    struct {
+        int level_y[2 /* dir */];
+        int level_u, level_v;
+        int mode_ref_delta_enabled;
+        int mode_ref_delta_update;
+        Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
+        int sharpness;
+    } loopfilter;
+    struct {
+        int damping;
+        int n_bits;
+        int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
+        int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
+    } cdef;
+    struct {
+        enum Dav1dRestorationType type[3 /* plane */];
+        int unit_size[2 /* y, uv */];
+    } restoration;
+    enum Dav1dTxfmMode txfm_mode;
+    int switchable_comp_refs;
+    int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
+    int warp_motion;
+    int reduced_txtp_set;
+    Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
+} Dav1dFrameHeader;
+
+#endif /* DAV1D_HEADERS_H */
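
As the hbd documentation above spells out, the values 0/1/2 encode 8/10/12 bits per component. A small helper recovering the actual bitdepth and the spec's two flags (the function name is illustrative):

    #include "dav1d/headers.h"

    static int bitdepth_from_seq_hdr(const Dav1dSequenceHeader *const seq) {
        const int spec_hbd   = !!seq->hbd;     /* spec's high_bitdepth flag */
        const int twelve_bit = seq->hbd == 2;  /* spec's twelve_bit flag */
        (void)spec_hbd; (void)twelve_bit;
        return 8 + 2 * seq->hbd;               /* 8, 10 or 12 */
    }
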
diff --git a/include/dav1d/meson.build b/include/dav1d/meson.build
new file mode 100644 (file)
index 0000000..b5649d3
--- /dev/null
@@ -0,0 +1,41 @@
+# Copyright © 2019, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# installed version.h header generation
+version_h_data = configuration_data()
+version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
+version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
+version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
+version_h_target = configure_file(input: 'version.h.in',
+                                  output: 'version.h',
+                                  configuration: version_h_data)
+
+# install headers
+install_headers('common.h',
+                'data.h',
+                'dav1d.h',
+                'headers.h',
+                'picture.h',
+                version_h_target,
+                subdir : 'dav1d')
diff --git a/include/dav1d/picture.h b/include/dav1d/picture.h
new file mode 100644 (file)
index 0000000..98e5eb5
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_PICTURE_H
+#define DAV1D_PICTURE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "headers.h"
+
+/* Number of bytes to align AND pad picture memory buffers by, so that SIMD
+ * implementations can over-read by a few bytes, and use aligned read/write
+ * instructions. */
+#define DAV1D_PICTURE_ALIGNMENT 64
+
+typedef struct Dav1dPictureParameters {
+    int w; ///< width (in pixels)
+    int h; ///< height (in pixels)
+    enum Dav1dPixelLayout layout; ///< format of the picture
+    int bpc; ///< bits per pixel component (8, 10 or 12)
+} Dav1dPictureParameters;
+
+typedef struct Dav1dPicture {
+    Dav1dSequenceHeader *seq_hdr;
+    Dav1dFrameHeader *frame_hdr;
+
+    /**
+     * Pointers to planar image data (Y is [0], U is [1], V is [2]). The data
+     * should be bytes (for 8 bpc) or words (for 10 bpc). In case of words
+     * containing 10 bpc image data, the pixels should be located in the LSB
+     * bits, so that values range between [0, 1023]; the upper bits should be
+     * zero'ed out.
+     */
+    void *data[3];
+
+    /**
+     * Number of bytes between 2 lines in data[] for luma [0] or chroma [1].
+     */
+    ptrdiff_t stride[2];
+
+    Dav1dPictureParameters p;
+    Dav1dDataProps m;
+
+    /**
+     * High Dynamic Range Content Light Level metadata applying to this picture,
+     * as defined in section 5.8.3 and 6.7.3
+     */
+    Dav1dContentLightLevel *content_light;
+    /**
+     * High Dynamic Range Mastering Display Color Volume metadata applying to
+     * this picture, as defined in section 5.8.4 and 6.7.4
+     */
+    Dav1dMasteringDisplay *mastering_display;
+    /**
+     * ITU-T T.35 metadata as defined in sections 5.8.2 and 6.7.2
+     */
+    Dav1dITUTT35 *itut_t35;
+
+    uintptr_t reserved[4]; ///< reserved for future use
+
+    struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin
+    struct Dav1dRef *seq_hdr_ref; ///< Dav1dSequenceHeader allocation origin
+    struct Dav1dRef *content_light_ref; ///< Dav1dContentLightLevel allocation origin
+    struct Dav1dRef *mastering_display_ref; ///< Dav1dMasteringDisplay allocation origin
+    struct Dav1dRef *itut_t35_ref; ///< Dav1dITUTT35 allocation origin
+    uintptr_t reserved_ref[4]; ///< reserved for future use
+    struct Dav1dRef *ref; ///< Frame data allocation origin
+
+    void *allocator_data; ///< pointer managed by the allocator
+} Dav1dPicture;
+
+typedef struct Dav1dPicAllocator {
+    void *cookie; ///< custom data to pass to the allocator callbacks.
+    /**
+     * Allocate the picture buffer based on the Dav1dPictureParameters.
+     *
+     * data[0], data[1] and data[2] must be aligned to DAV1D_PICTURE_ALIGNMENT
+     * bytes and sized as if the pixel width/height were padded to a multiple
+     * of 128 pixels. Any allocated memory area should also be padded by
+     * DAV1D_PICTURE_ALIGNMENT bytes.
+     * data[1] and data[2] must share the same stride[1].
+     *
+     * This function will be called on the main thread (the thread which calls
+     * dav1d_get_picture()).
+     *
+     * @param  pic The picture to allocate the buffer for. The callback needs to
+     *             fill the picture data[0], data[1], data[2], stride[0] and
+     *             stride[1].
+     *             The allocator can fill the pic allocator_data pointer with
+     *             a custom pointer that will be passed to
+     *             release_picture_callback().
+     * @param cookie Custom pointer passed to all calls.
+     *
+     * @note The callback must not fill any fields other than data, stride and
+     *       allocator_data.
+     * @return 0 on success. A negative DAV1D_ERR value on error.
+     */
+    int (*alloc_picture_callback)(Dav1dPicture *pic, void *cookie);
+    /**
+     * Release the picture buffer.
+     *
+     * If frame threading is used, this function may be called by the main
+     * thread (the thread which calls dav1d_get_picture()) or any of the frame
+     * threads and thus must be thread-safe. If frame threading is not used,
+     * this function will only be called on the main thread.
+     *
+     * @param pic    The picture that was filled by alloc_picture_callback().
+     * @param cookie Custom pointer passed to all calls.
+     */
+    void (*release_picture_callback)(Dav1dPicture *pic, void *cookie);
+} Dav1dPicAllocator;
+
+/**
+ * Release reference to a picture.
+ */
+DAV1D_API void dav1d_picture_unref(Dav1dPicture *p);
+
+#endif /* DAV1D_PICTURE_H */
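For reference, the allocator contract above is small enough to show end to end. The sketch below is not part of the imported sources: the names my_alloc_picture / my_release_picture are hypothetical, C11 aligned_alloc stands in for a platform-appropriate aligned allocator, and only the documented requirements (DAV1D_PICTURE_ALIGNMENT-byte pointers, 128-pixel dimension rounding, trailing padding, shared chroma stride, and filling only data, stride and allocator_data) are load-bearing:

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    #include <dav1d/picture.h>

    static int my_alloc_picture(Dav1dPicture *const pic, void *const cookie) {
        (void)cookie;
        const int hbd = pic->p.bpc > 8;                /* 2 bytes per sample at 10 bpc */
        const int aligned_w = (pic->p.w + 127) & ~127; /* round up to multiple of 128 */
        const int aligned_h = (pic->p.h + 127) & ~127;
        const int has_chroma = pic->p.layout != DAV1D_PIXEL_LAYOUT_I400;
        const int ss_ver = pic->p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = pic->p.layout != DAV1D_PIXEL_LAYOUT_I444;

        pic->stride[0] = (ptrdiff_t)aligned_w << hbd;
        pic->stride[1] = has_chroma ? pic->stride[0] >> ss_hor : 0;
        const size_t y_sz  = (size_t)pic->stride[0] * aligned_h;
        const size_t uv_sz = (size_t)pic->stride[1] * (aligned_h >> ss_ver);
        /* one block for all three planes, plus trailing padding for over-reads */
        size_t sz = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
        sz = (sz + DAV1D_PICTURE_ALIGNMENT - 1) & ~(size_t)(DAV1D_PICTURE_ALIGNMENT - 1);

        uint8_t *const buf = aligned_alloc(DAV1D_PICTURE_ALIGNMENT, sz);
        if (!buf) return DAV1D_ERR(ENOMEM);

        pic->data[0] = buf;
        pic->data[1] = has_chroma ? buf + y_sz : NULL;
        pic->data[2] = has_chroma ? buf + y_sz + uv_sz : NULL;
        pic->allocator_data = buf; /* handed back to the release callback */
        return 0;
    }

    static void my_release_picture(Dav1dPicture *const pic, void *const cookie) {
        (void)cookie;
        free(pic->allocator_data);
    }

A client would install the pair in Dav1dSettings.allocator before dav1d_open() and drop each returned frame with dav1d_picture_unref(). Note that stride[] is in bytes: for 10 bpc content a row of data[0] holds w uint16_t samples, each carrying its value in the low 10 bits.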
diff --git a/include/dav1d/version.h.in b/include/dav1d/version.h.in
new file mode 100644 (file)
index 0000000..30bfd11
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_VERSION_H
+#define DAV1D_VERSION_H
+
+#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
+#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
+#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
+
+#endif /* DAV1D_VERSION_H */
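After configure-time substitution these macros let a client pin itself to an API generation at compile time. A minimal, hypothetical guard (major version 4 matches the soname set in the top-level meson.build below):

    #include <dav1d/version.h>

    #if DAV1D_API_VERSION_MAJOR != 4
    #error "built against an unexpected dav1d API version"
    #endif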
diff --git a/include/meson.build b/include/meson.build
new file mode 100644 (file)
index 0000000..c83bfcd
--- /dev/null
@@ -0,0 +1,36 @@
+# Copyright © 2018, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Revision file (vcs_version.h) generation
+dav1d_git_dir = join_paths(dav1d_src_root, '.git')
+rev_target = vcs_tag(command: [
+        'git', '--git-dir', dav1d_git_dir,
+        'describe', '--tags', '--long',
+        '--match', '?.*.*', '--always'
+    ],
+    input: 'vcs_version.h.in',
+    output: 'vcs_version.h'
+)
+
+subdir('dav1d')
diff --git a/include/vcs_version.h.in b/include/vcs_version.h.in
new file mode 100644 (file)
index 0000000..71ed2f6
--- /dev/null
@@ -0,0 +1,2 @@
+/* auto-generated, do not edit */
+#define DAV1D_VERSION "@VCS_TAG@"
diff --git a/meson.build b/meson.build
new file mode 100644 (file)
index 0000000..d5366f9
--- /dev/null
@@ -0,0 +1,447 @@
+# Copyright © 2018-2020, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+project('dav1d', ['c'],
+    version: '0.7.1',
+    default_options: ['c_std=c99',
+                      'warning_level=2',
+                      'buildtype=release',
+                      'b_ndebug=if-release'],
+    meson_version: '>= 0.47.0')
+
+dav1d_soname_version       = '4.0.2'
+dav1d_api_version_array    = dav1d_soname_version.split('.')
+dav1d_api_version_major    = dav1d_api_version_array[0]
+dav1d_api_version_minor    = dav1d_api_version_array[1]
+dav1d_api_version_revision = dav1d_api_version_array[2]
+
+dav1d_src_root = meson.current_source_dir()
+cc = meson.get_compiler('c')
+
+# Configuration data for config.h
+cdata = configuration_data()
+
+# Configuration data for config.asm
+cdata_asm = configuration_data()
+
+# Include directories
+dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include'])
+
+
+
+#
+# Option handling
+#
+
+# Bitdepth option
+dav1d_bitdepths = get_option('bitdepths')
+foreach bitdepth : ['8', '16']
+    cdata.set10('CONFIG_@0@BPC'.format(bitdepth), dav1d_bitdepths.contains(bitdepth))
+endforeach
+
+# ASM option
+is_asm_enabled = (get_option('enable_asm') == true and
+    (host_machine.cpu_family().startswith('x86') or
+     host_machine.cpu_family() == 'aarch64'      or
+     host_machine.cpu_family().startswith('arm') or
+     host_machine.cpu() == 'ppc64le'))
+cdata.set10('HAVE_ASM', is_asm_enabled)
+
+if is_asm_enabled and get_option('b_sanitize') == 'memory'
+    error('asm causes false positive with memory sanitizer. Use \'-Denable_asm=false\'.')
+endif
+
+# Logging option
+cdata.set10('CONFIG_LOG', get_option('logging'))
+
+#
+# OS/Compiler checks and defines
+#
+
+# Arguments in test_args will be used even on feature tests
+test_args = []
+
+optional_arguments = []
+
+if host_machine.system() == 'linux'
+    test_args += '-D_GNU_SOURCE'
+    add_project_arguments('-D_GNU_SOURCE', language: 'c')
+elif host_machine.system() == 'darwin'
+    test_args += '-D_DARWIN_C_SOURCE'
+    add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
+else
+    test_args += '-D_POSIX_C_SOURCE=200112L'
+    add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
+endif
+
+if host_machine.system() == 'windows'
+    cdata.set('_WIN32_WINNT',           '0x0601')
+    cdata.set('UNICODE',                1) # Define to 1 for Unicode (Wide Chars) APIs
+    cdata.set('_UNICODE',               1) # Define to 1 for Unicode (Wide Chars) APIs
+    cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf
+    cdata.set('_CRT_DECLARE_NONSTDC_NAMES', 1) # Define to get off_t from sys/types.h on MSVC
+    if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
+        cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows
+    else
+        cdata.set('fseeko', '_fseeki64')
+        cdata.set('ftello', '_ftelli64')
+    endif
+
+    if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
+        optional_arguments += '-mcmodel=small'
+    endif
+
+    # On Windows, we use a compatibility layer to emulate pthread
+    thread_dependency = []
+    thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
+
+    rt_dependency = []
+else
+    thread_dependency = dependency('threads')
+    thread_compat_dep = []
+
+    rt_dependency = []
+    if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
+        cdata.set('HAVE_CLOCK_GETTIME', 1)
+    elif host_machine.system() != 'darwin'
+        rt_dependency = cc.find_library('rt', required: false)
+        if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
+            error('clock_gettime not found')
+        endif
+        cdata.set('HAVE_CLOCK_GETTIME', 1)
+    endif
+endif
+
+# Check for fseeko on Android. It is not always available if _FILE_OFFSET_BITS is defined to 64.
+have_fseeko = true
+if host_machine.system() == 'android'
+    if not cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
+        if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args + ['-U_FILE_OFFSET_BITS'])
+            warning('Files larger than 2 gigabytes might not be supported in the dav1d CLI tool.')
+            add_project_arguments('-U_FILE_OFFSET_BITS', language: 'c')
+        elif get_option('enable_tools')
+            error('dav1d CLI tool needs fseeko()')
+        else
+            have_fseeko = false
+        endif
+    endif
+endif
+
+libdl_dependency = []
+if host_machine.system() == 'linux'
+    libdl_dependency = cc.find_library('dl', required : false)
+    if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
+        cdata.set('HAVE_DLSYM', 1)
+    endif
+endif
+
+
+# Header checks
+
+stdatomic_dependency = []
+if not cc.check_header('stdatomic.h')
+    if cc.get_id() == 'msvc'
+        # we have a custom replacement for MSVC
+        stdatomic_dependency = declare_dependency(
+            include_directories : include_directories('include/compat/msvc'),
+        )
+    elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''',
+                     name : 'GCC-style atomics', args : test_args)
+        stdatomic_dependency = declare_dependency(
+            include_directories : include_directories('include/compat/gcc'),
+        )
+    else
+        error('Atomics not supported')
+    endif
+endif
+
+if cc.check_header('unistd.h')
+    cdata.set('HAVE_UNISTD_H', 1)
+endif
+
+if cc.check_header('io.h')
+    cdata.set('HAVE_IO_H', 1)
+endif
+
+
+# Function checks
+
+if not cc.has_function('getopt_long', prefix : '#include <getopt.h>', args : test_args)
+    getopt_dependency = declare_dependency(
+        sources: files('tools/compat/getopt.c'),
+        include_directories : include_directories('include/compat'),
+    )
+else
+    getopt_dependency = []
+endif
+
+if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
+    cdata.set('HAVE_ALIGNED_MALLOC', 1)
+elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    cdata.set('HAVE_POSIX_MEMALIGN', 1)
+elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
+    cdata.set('HAVE_MEMALIGN', 1)
+endif
+
+if (host_machine.cpu_family() == 'aarch64' or
+    host_machine.cpu_family().startswith('arm') or
+    host_machine.cpu() == 'ppc64le')
+    if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
+        cdata.set('HAVE_GETAUXVAL', 1)
+    endif
+    if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
+        cdata.set('HAVE_ELF_AUX_INFO', 1)
+    endif
+endif
+
+# Compiler flag tests
+
+if cc.has_argument('-fvisibility=hidden')
+    add_project_arguments('-fvisibility=hidden', language: 'c')
+else
+    warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
+endif
+
+# Compiler flags that should be set, but which are silently
+# tolerated (not an error) when the compiler does not support them
+if cc.get_id() != 'msvc'
+    optional_arguments += [
+      '-Wundef',
+      '-Werror=vla',
+      '-Wno-maybe-uninitialized',
+      '-Wno-missing-field-initializers',
+      '-Wno-unused-parameter',
+      '-Werror=missing-prototypes',
+      '-Wshorten-64-to-32',
+    ]
+    if host_machine.cpu_family() == 'x86'
+        optional_arguments += [
+          '-msse2',
+          '-mfpmath=sse',
+        ]
+    endif
+else
+    optional_arguments += [
+      '-wd4028', # parameter different from declaration
+      '-wd4996'  # use of POSIX functions
+    ]
+endif
+
+if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
+    optional_arguments += '-fomit-frame-pointer'
+    optional_arguments += '-ffast-math'
+endif
+
+if (host_machine.system() == 'darwin' and cc.get_id() == 'clang' and
+    cc.version().startswith('11'))
+    # Workaround for Xcode 11 -fstack-check bug, see #301
+    optional_arguments += '-fno-stack-check'
+endif
+
+add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
+
+# libFuzzer related things
+fuzzing_engine = get_option('fuzzing_engine')
+if fuzzing_engine == 'libfuzzer'
+    if not cc.has_argument('-fsanitize=fuzzer')
+        error('fuzzing_engine libfuzzer requires "-fsanitize=fuzzer"')
+    endif
+    fuzzer_args = ['-fsanitize=fuzzer-no-link', '-fsanitize=fuzzer']
+    add_project_arguments(cc.first_supported_argument(fuzzer_args), language : 'c')
+endif
+
+# Stack alignments flags
+
+stackalign_flag = []
+stackrealign_flag = []
+
+cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big')
+
+if host_machine.cpu_family().startswith('x86')
+    if get_option('stack_alignment') > 0
+        stack_alignment = get_option('stack_alignment')
+    elif host_machine.cpu_family() == 'x86_64'
+        if cc.has_argument('-mpreferred-stack-boundary=6')
+            stackalign_flag = ['-mpreferred-stack-boundary=6']
+            stackrealign_flag = ['-mincoming-stack-boundary=4']
+            stack_alignment = 32
+        elif cc.has_argument('-mstack-alignment=64')
+            stackalign_flag = ['-mstack-alignment=64']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 32
+        else
+            stack_alignment = 16
+        endif
+    else
+        if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
+            stack_alignment = 16
+        elif cc.has_argument('-mpreferred-stack-boundary=4')
+            stackalign_flag = ['-mpreferred-stack-boundary=4']
+            stackrealign_flag = ['-mincoming-stack-boundary=2']
+            stack_alignment = 16
+        elif cc.has_argument('-mstack-alignment=16')
+            stackalign_flag = ['-mstack-alignment=16']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 16
+        else
+            stack_alignment = 4
+        endif
+    endif
+    cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
+    cdata.set('STACK_ALIGNMENT', stack_alignment)
+endif
+
+cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64')
+cdata.set10('ARCH_ARM',     host_machine.cpu_family().startswith('arm'))
+if (is_asm_enabled and
+    (host_machine.cpu_family() == 'aarch64' or
+     host_machine.cpu_family().startswith('arm')))
+
+    as_func_code = '''__asm__ (
+".func meson_test"
+".endfunc"
+);
+'''
+    have_as_func = cc.compiles(as_func_code)
+    cdata.set10('HAVE_AS_FUNC', have_as_func)
+
+    # Fedora's package build infrastructure uses a gcc specs file to enable
+    # '-fPIE' by default. That mechanism only adds '-fPIE' to the C compiler
+    # with its integrated preprocessor; it is not added to the standalone
+    # preprocessor or to the preprocessing stage of '.S' files. So we have to
+    # compile code to check whether we need to define PIC for the arm asm, to
+    # avoid absolute relocations when building e.g. checkasm.
+    check_pic_code = '''
+#if defined(PIC)
+#error "PIC already defined"
+#elif !(defined(__PIC__) || defined(__pic__))
+#error "no pic"
+#endif
+'''
+    if cc.compiles(check_pic_code)
+        cdata.set('PIC', '3')
+    endif
+endif
+
+cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
+cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
+cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
+
+if host_machine.cpu_family().startswith('x86')
+    cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
+    cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
+    cdata_asm.set10('PIC', true)
+endif
+
+cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
+
+if cc.symbols_have_underscore_prefix()
+    cdata.set10('PREFIX', true)
+    cdata_asm.set10('PREFIX', true)
+endif
+
+#
+# ASM specific stuff
+#
+if is_asm_enabled and host_machine.cpu_family().startswith('x86')
+
+    # NASM compiler support
+
+    nasm = find_program('nasm')
+
+    # check NASM version
+    if nasm.found()
+        nasm_r = run_command(nasm, '-v')
+
+        if nasm_r.returncode() != 0
+            error('failed running nasm to obtain its version')
+        endif
+
+        out = nasm_r.stdout().strip().split()
+        if out[1].to_lower() == 'version'
+            if out[2].version_compare('<2.13.02')
+                error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
+            elif out[2].version_compare('<2.14') and get_option('enable_avx512')
+                error('nasm 2.14 or later is required for AVX-512 asm.\n' +
+                       'AVX-512 asm can be disabled with \'-Denable_avx512=false\'')
+            endif
+            cdata.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
+            cdata_asm.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
+        else
+            error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
+        endif
+    endif
+
+    # Generate config.asm
+    config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm)
+
+    if host_machine.system() == 'windows'
+        nasm_format = 'win'
+    elif host_machine.system() == 'darwin'
+        nasm_format = 'macho'
+    else
+        nasm_format = 'elf'
+    endif
+    if host_machine.cpu_family() == 'x86_64'
+        nasm_format += '64'
+    else
+        nasm_format += '32'
+    endif
+
+    nasm_gen = generator(nasm,
+        output: '@BASENAME@.obj',
+        depfile: '@BASENAME@.obj.ndep',
+        arguments: [
+            '-f', nasm_format,
+            '-I', '@0@/src/'.format(dav1d_src_root),
+            '-I', '@0@/'.format(meson.current_build_dir()),
+            '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
+            '@EXTRA_ARGS@',
+            '@INPUT@',
+            '-o', '@OUTPUT@'
+        ])
+endif
+
+
+# Generate config.h
+config_h_target = configure_file(output: 'config.h', configuration: cdata)
+
+
+
+#
+# Include subdir meson.build files
+# The order is important!
+
+subdir('include')
+
+subdir('doc')
+
+subdir('src')
+
+subdir('tools')
+
+subdir('examples')
+
+subdir('tests')
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644 (file)
index 0000000..37bd084
--- /dev/null
@@ -0,0 +1,55 @@
+# General options
+
+option('bitdepths',
+    type: 'array',
+    choices: ['8', '16'],
+    description: 'Enable only specified bitdepths')
+
+option('enable_asm',
+    type: 'boolean',
+    value: true,
+    description: 'Build asm files, if available')
+
+option('enable_avx512',
+    type: 'boolean',
+    value: true,
+    description: 'Build AVX-512 asm files, requires nasm 2.14')
+
+option('enable_tools',
+    type: 'boolean',
+    value: true,
+    description: 'Build dav1d cli tools')
+
+option('enable_examples',
+    type: 'boolean',
+    value: false,
+    description: 'Build dav1d examples')
+
+option('enable_tests',
+    type: 'boolean',
+    value: true,
+    description: 'Build dav1d tests')
+
+option('logging',
+    type: 'boolean',
+    value: true,
+    description: 'Print error log messages using the provided callback function')
+
+option('testdata_tests',
+    type: 'boolean',
+    value: false,
+    description: 'Run tests requiring the test data repository')
+
+option('fuzzing_engine',
+    type: 'combo',
+    choices : ['none', 'libfuzzer', 'oss-fuzz'],
+    value: 'none',
+    description: 'Select the fuzzing engine')
+
+option('fuzzer_ldflags',
+    type: 'string',
+    description: 'Extra LDFLAGS used during linking of fuzzing binaries')
+
+option('stack_alignment',
+    type: 'integer',
+    value: 0)
diff --git a/package/crossfiles/aarch64-android.meson b/package/crossfiles/aarch64-android.meson
new file mode 100644 (file)
index 0000000..a25ea43
--- /dev/null
@@ -0,0 +1,16 @@
+[binaries]
+c = 'aarch64-linux-android21-clang'
+cpp = 'aarch64-linux-android21-clang++'
+ar = 'aarch64-linux-android-ar'
+strip = 'aarch64-linux-android-strip'
+pkgconfig = 'pkg-config'
+windres = 'aarch64-linux-android-windres'
+
+[properties]
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'android'
+cpu_family = 'aarch64'
+endian = 'little'
+cpu = 'aarch64'
diff --git a/package/crossfiles/arm-android.meson b/package/crossfiles/arm-android.meson
new file mode 100644 (file)
index 0000000..dd07d98
--- /dev/null
@@ -0,0 +1,16 @@
+[binaries]
+c = 'armv7a-linux-androideabi16-clang'
+cpp = 'armv7a-linux-androideabi16-clang++'
+ar = 'arm-linux-androideabi-ar'
+strip = 'arm-linux-androideabi-strip'
+pkgconfig = 'pkg-config'
+windres = 'arm-linux-androideabi-windres'
+
+[properties]
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'android'
+cpu_family = 'arm'
+endian = 'little'
+cpu = 'arm'
diff --git a/package/crossfiles/i686-linux32.meson b/package/crossfiles/i686-linux32.meson
new file mode 100644 (file)
index 0000000..95e99b3
--- /dev/null
@@ -0,0 +1,15 @@
+[binaries]
+c = 'gcc'
+cpp = 'g++'
+ar = 'ar'
+strip = 'strip'
+
+[properties]
+c_link_args = ['-m32']
+c_args = ['-m32']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'x86'
+cpu = 'i686'
+endian = 'little'
diff --git a/package/crossfiles/i686-w64-mingw32.meson b/package/crossfiles/i686-w64-mingw32.meson
new file mode 100644 (file)
index 0000000..1fae70a
--- /dev/null
@@ -0,0 +1,17 @@
+[binaries]
+c = 'i686-w64-mingw32-gcc'
+cpp = 'i686-w64-mingw32-g++'
+ar = 'i686-w64-mingw32-ar'
+strip = 'i686-w64-mingw32-strip'
+windres = 'i686-w64-mingw32-windres'
+exe_wrapper = ['wine']
+
+[properties]
+c_link_args = ['-static-libgcc']
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'windows'
+cpu_family = 'x86'
+cpu = 'i686'
+endian = 'little'
diff --git a/package/crossfiles/x86_64-w64-mingw32.meson b/package/crossfiles/x86_64-w64-mingw32.meson
new file mode 100644 (file)
index 0000000..068a128
--- /dev/null
@@ -0,0 +1,17 @@
+[binaries]
+c = 'x86_64-w64-mingw32-gcc'
+cpp = 'x86_64-w64-mingw32-g++'
+ar = 'x86_64-w64-mingw32-ar'
+strip = 'x86_64-w64-mingw32-strip'
+windres = 'x86_64-w64-mingw32-windres'
+exe_wrapper = ['wine']
+
+[properties]
+c_link_args = ['-static-libgcc']
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'windows'
+cpu_family = 'x86_64'
+cpu = 'x86_64'
+endian = 'little'
diff --git a/package/snap/snapcraft.yaml b/package/snap/snapcraft.yaml
new file mode 100644 (file)
index 0000000..cecd0bd
--- /dev/null
@@ -0,0 +1,24 @@
+name: dav1d
+base: core18
+version: git
+version-script: git describe HEAD --always
+summary: AV1 decoder from VideoLAN
+description: |
+  A small and fast AV1 decoder from the people who brought you VLC.
+
+grade: stable
+confinement: strict # needs the right plugs and slots (here: the 'home' plug)
+
+apps:
+  dav1d:
+    command: usr/bin/dav1d
+    plugs: [ 'home' ]
+
+parts:
+  dav1d:
+    plugin: meson
+    source: ../../
+    build-packages: [ 'nasm' ]
+    meson-parameters:
+      - --prefix=/usr
+      - --buildtype=release
diff --git a/src/arm/32/cdef.S b/src/arm/32/cdef.S
new file mode 100644 (file)
index 0000000..d8e72be
--- /dev/null
@@ -0,0 +1,974 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2]
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s1, #\w]
+        vmov.16         d4[1], r12
+        ldrh            r12, [\s2, #-2]
+        vldr            \n2, [\s2]
+        vmov.16         d4[2], r12
+        ldrh            r12, [\s2, #\w]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[3], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s10, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s11, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2]
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #-2]
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s9,  [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        ldrh            r12, [\s1, #\w]
+        vldr            \n2, [\s2]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #\w]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s8,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+        vld1.32         {\dst\()[0]}, [\src, :32], \incr
+.else
+        vld1.8          {\dst\()},    [\src, :64], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top, int h,
+//                                    enum CdefEdgeFlags edges);
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro padding_func w, stride, n1, w1, n2, w2, align
+function cdef_padding\w\()_8bpc_neon, export=1
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldr             r6,  [sp, #28]
+        cmp             r6,  #0xf // fully edged
+        beq             cdef_padding\w\()_edged_8bpc_neon
+        vmov.i16        q3,  #0x8000
+        tst             r6,  #4 // CDEF_HAVE_TOP
+        bne             1f
+        // !CDEF_HAVE_TOP
+        sub             r12, r0,  #2*(2*\stride+2)
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]!
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        add             r7,  r4,  r2
+        sub             r0,  r0,  #2*(2*\stride)
+        pad_top_bottom  r4,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+        // Middle section
+3:
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldrh            r12, [r3], #2
+        vldr            \n1, [r1]
+        vdup.16         d2,  r12
+        ldrh            r12, [r1, #\w]
+        add             r1,  r1,  r2
+        subs            r5,  r5,  #1
+        vmov.16         d2[1], r12
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s5,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldrh            r12, [r3], #2
+        load_n_incr     d0,  r1,  r2,  \w
+        vdup.16         d2,  r12
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+        b               3f
+2:
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldrh            r12, [r1, #\w]
+        load_n_incr     d0,  r1,  r2,  \w
+        vdup.16         d2,  r12
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s4,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        load_n_incr     d0,  r1,  r2,  \w
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+
+3:
+        tst             r6,  #8 // CDEF_HAVE_BOTTOM
+        bne             1f
+        // !CDEF_HAVE_BOTTOM
+        sub             r12, r0,  #4
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]!
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        pop             {r4-r7,pc}
+1:
+        // CDEF_HAVE_BOTTOM
+        add             r7,  r1,  r2
+        pad_top_bottom  r1,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128
+padding_func 4, 8,  s0, d0, s4, d2, 64
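For reference, here is a scalar C sketch of what the padding functions above compute; it is not dav1d's C reference code, the function name is illustrative, and it fills the sentinel everywhere first where the asm writes it only where needed (the result is the same). The fully-edged fast path that follows (edges == 0xf, reached via the beq at the top of padding_func) skips the sentinel entirely and keeps an 8-bit buffer instead.

    #include <stddef.h>
    #include <stdint.h>

    enum { CDEF_HAVE_LEFT = 1, CDEF_HAVE_RIGHT = 2,
           CDEF_HAVE_TOP = 4, CDEF_HAVE_BOTTOM = 8 };

    /* tmp points at the top-left pixel of the w x h block inside the padded
     * buffer; tmp_stride is in uint16_t units (16 for w == 8, 8 for w == 4). */
    static void cdef_padding_sketch(uint16_t *const tmp, const ptrdiff_t tmp_stride,
                                    const uint8_t *const src, const ptrdiff_t src_stride,
                                    const uint8_t (*const left)[2],
                                    const uint8_t *const top,
                                    const int w, const int h, const int edges)
    {
        /* 0x8000 is chosen so that abs(diff) >> shift saturates any threshold's
         * clip to 0 in handle_pixel, i.e. missing pixels contribute nothing. */
        for (int y = -2; y < h + 2; y++)
            for (int x = -2; x < w + 2; x++)
                tmp[y * tmp_stride + x] = 0x8000;

        const int x0 = (edges & CDEF_HAVE_LEFT)   ? -2    : 0;
        const int x1 = (edges & CDEF_HAVE_RIGHT)  ? w + 2 : w;
        const int y0 = (edges & CDEF_HAVE_TOP)    ? -2    : 0;
        const int y1 = (edges & CDEF_HAVE_BOTTOM) ? h + 2 : h;
        for (int y = y0; y < y1; y++)
            for (int x = x0; x < x1; x++) {
                uint8_t px;
                if (y < 0)       px = top[(2 + y) * src_stride + x]; /* 2 rows above */
                else if (y >= h) px = src[y * src_stride + x];       /* 2 rows below */
                else if (x < 0)  px = left[y][2 + x];                /* saved left pixels */
                else             px = src[y * src_stride + x];       /* block + right edge */
                tmp[y * tmp_stride + x] = px;
            }
    }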
+
+// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top, int h,
+//                                    enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg, align
+function cdef_padding\w\()_edged_8bpc_neon
+        sub             r0,  r0,  #(2*\stride)
+
+        ldrh            r12, [r4, #-2]
+        vldr            \reg, [r4]
+        add             r7,  r4,  r2
+        strh            r12, [r0, #-2]
+        ldrh            r12, [r4, #\w]
+        vstr            \reg, [r0]
+        strh            r12, [r0, #\w]
+
+        ldrh            r12, [r7, #-2]
+        vldr            \reg, [r7]
+        strh            r12, [r0, #\stride-2]
+        ldrh            r12, [r7, #\w]
+        vstr            \reg, [r0, #\stride]
+        strh            r12, [r0, #\stride+\w]
+        add             r0,  r0,  #2*\stride
+
+0:
+        ldrh            r12, [r3], #2
+        vldr            \reg, [r1]
+        str             r12, [r0, #-2]
+        ldrh            r12, [r1, #\w]
+        add             r1,  r1,  r2
+        subs            r5,  r5,  #1
+        vstr            \reg, [r0]
+        str             r12, [r0, #\w]
+        add             r0,  r0,  #\stride
+        bgt             0b
+
+        ldrh            r12, [r1, #-2]
+        vldr            \reg, [r1]
+        add             r7,  r1,  r2
+        strh            r12, [r0, #-2]
+        ldrh            r12, [r1, #\w]
+        vstr            \reg, [r0]
+        strh            r12, [r0, #\w]
+
+        ldrh            r12, [r7, #-2]
+        vldr            \reg, [r7]
+        strh            r12, [r0, #\stride-2]
+        ldrh            r12, [r7, #\w]
+        vstr            \reg, [r0, #\stride]
+        strh            r12, [r0, #\stride+\w]
+
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+padding_func_edged 8, 16, d0, 64
+padding_func_edged 4, 8,  s0, 32
+
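+// Each row holds the two tap offsets (as signed byte offsets into the padded
+// tmp buffer) at distance 1 and 2 along one of the 8 CDEF directions.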
+.macro dir_table w, stride
+const directions\w
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+        .byte            1 * \stride + 0,  2 * \stride + 0
+        .byte            1 * \stride + 0,  2 * \stride - 1
+// Repeated, to avoid the & 7 wrap when indexing rows dir + 2 and dir + 6
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+endconst
+.endm
+
+dir_table 8, 16
+dir_table 4, 8
+
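+// pri_strength & 1 selects the tap pair: {4, 2} for even strengths, {3, 3} for odd.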
+const pri_taps
+        .byte           4, 2, 3, 3
+endconst
+
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11,\d12}, [r6]    // p0
+        vld1.16         {\d21,\d22}, [r9]    // p1
+.else
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11}, [r6]         // p0
+        add             r6,  r6,  #2*8       // += stride
+        vld1.16         {\d21}, [r9]         // p1
+        add             r9,  r9,  #2*8       // += stride
+        vld1.16         {\d12}, [r6]         // p0
+        vld1.16         {\d22}, [r9]         // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+        vmin.u16        q2,  q2,  \s1
+        vmax.s16        q3,  q3,  \s1
+        vmin.u16        q2,  q2,  \s2
+        vmax.s16        q3,  q3,  \s2
+.endif
+        vabd.u16        q8,  q0,  \s1        // abs(diff)
+        vabd.u16        q11, q0,  \s2        // abs(diff)
+        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
+        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
+        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+        vsub.i16        q10, \s1, q0         // diff = p0 - px
+        vsub.i16        q13, \s2, q0         // diff = p1 - px
+        vneg.s16        q8,  q9              // -clip
+        vneg.s16        q11, q12             // -clip
+        vmin.s16        q10, q10, q9         // imin(diff, clip)
+        vmin.s16        q13, q13, q12        // imin(diff, clip)
+        vdup.16         q9,  \tap            // taps[k]
+        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
+        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
+        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain()
+        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
+.endm
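In scalar form, the annotated sequence above is the CDEF constrain step; the inline comments translate to roughly the following C sketch (illustrative names, following the comments rather than any particular dav1d source file):

    #include <stdlib.h>

    /* shift = imax(0, damping - ulog2(threshold)), as computed with clz in the
     * function prologues below. */
    static inline int constrain(const int diff, const int threshold,
                                const int shift)
    {
        /* clip = imax(0, threshold - (abs(diff) >> shift)); the saturating
         * vqsub.u16 above implements the imax(0, ...) part. */
        int clip = threshold - (abs(diff) >> shift);
        if (clip < 0) clip = 0;
        /* constrain() = imax(imin(diff, clip), -clip) */
        return diff > clip ? clip : diff < -clip ? -clip : diff;
    }

    /* Per tap pair:  sum += taps[k] * (constrain(p0 - px, t, s) +
     *                                  constrain(p1 - px, t, s));
     * epilogue:      out = px + ((8 + sum - (sum < 0)) >> 4);        */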
+
+// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
+//                                   const uint16_t *tmp, int pri_strength,
+//                                   int sec_strength, int dir, int damping,
+//                                   int h, size_t edges);
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+        cmp             r8,  #0xf
+        beq             cdef_filter\w\suffix\()_edged_neon
+.if \pri
+        movrel_local    r8,  pri_taps
+        and             r9,  r3,  #1
+        add             r8,  r8,  r9, lsl #1
+.endif
+        movrel_local    r9,  directions\w
+        add             r5,  r9,  r5, lsl #1
+        vmov.u16        d17, #15
+        vdup.16         d16, r6              // damping
+
+.if \pri
+        vdup.16         q5,  r3              // threshold
+.endif
+.if \sec
+        vdup.16         q7,  r4              // threshold
+.endif
+        vmov.16         d8[0], r3
+        vmov.16         d8[1], r4
+        vclz.i16        d8,  d8              // clz(threshold)
+        vsub.i16        d8,  d17, d8         // ulog2(threshold)
+        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
+        vneg.s16        d8,  d8              // -shift
+.if \sec
+        vdup.16         q6,  d8[1]
+.endif
+.if \pri
+        vdup.16         q4,  d8[0]
+.endif
+
+1:
+.if \w == 8
+        vld1.16         {q0},  [r2, :128]    // px
+.else
+        add             r12, r2,  #2*8
+        vld1.16         {d0},  [r2,  :64]    // px
+        vld1.16         {d1},  [r12, :64]    // px
+.endif
+
+        vmov.u16        q1,  #0              // sum
+.if \min
+        vmov.u16        q2,  q0              // min
+        vmov.u16        q3,  q0              // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrement it for the second round.
+        // It also serves as the loop counter.
+        mov             lr,  #2              // sec_taps[0]
+
+2:
+.if \pri
+        ldrsb           r9,  [r5]            // off1
+
+        load_px         d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+        add             r5,  r5,  #4         // +2*2
+        ldrsb           r9,  [r5]            // off2
+.endif
+
+.if \pri
+        ldrb            r12, [r8]            // *pri_taps
+
+        handle_pixel    q14, q15, q5,  q4,  r12, \min
+.endif
+
+.if \sec
+        load_px         d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #8         // +2*4
+        ldrsb           r9,  [r5]            // off3
+
+        handle_pixel    q14, q15, q7,  q6,  lr, \min
+
+        load_px         d28, d29, d30, d31, \w
+
+        handle_pixel    q14, q15, q7,  q6,  lr, \min
+
+        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
+.else
+        add             r5,  r5,  #1         // r5 += 1
+.endif
+        subs            lr,  lr,  #1         // sec_tap-- (value)
+.if \pri
+        add             r8,  r8,  #1         // pri_taps++ (pointer)
+.endif
+        bne             2b
+
+        vshr.s16        q14, q1,  #15        // -(sum < 0)
+        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
+        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
+        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
+.if \min
+        vmin.s16        q0,  q0,  q3
+        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
+.endif
+        vmovn.u16       d0,  q0
+.if \w == 8
+        add             r2,  r2,  #2*16      // tmp += tmp_stride
+        subs            r7,  r7,  #1         // h--
+        vst1.8          {d0}, [r0, :64], r1
+.else
+        vst1.32         {d0[0]}, [r0, :32], r1
+        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
+        subs            r7,  r7,  #2         // h -= 2
+        vst1.32         {d0[1]}, [r0, :32], r1
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             r5,  r5,  #2
+.if \pri
+        sub             r8,  r8,  #2
+.endif
+
+        bgt             1b
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_8bpc_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #92]
+        ldrd            r6,  r7,  [sp, #100]
+        ldr             r8,  [sp, #108]
+        cmp             r3,  #0 // pri_strength
+        bne             1f
+        b               cdef_filter\w\()_sec_neon // only sec
+1:
+        cmp             r4,  #0 // sec_strength
+        bne             1f
+        b               cdef_filter\w\()_pri_neon // only pri
+1:
+        b               cdef_filter\w\()_pri_sec_neon // both pri and sec
+endfunc
+.endm
+
+filter 8
+filter 4
+
+.macro load_px_8 d11, d12, d21, d22, w
+.if \w == 8
+        add             r6,  r2,  r9         // x + off
+        sub             r9,  r2,  r9         // x - off
+        vld1.8          {\d11}, [r6]         // p0
+        add             r6,  r6,  #16        // += stride
+        vld1.8          {\d21}, [r9]         // p1
+        add             r9,  r9,  #16        // += stride
+        vld1.8          {\d12}, [r6]         // p0
+        vld1.8          {\d22}, [r9]         // p1
+.else
+        add             r6,  r2,  r9         // x + off
+        sub             r9,  r2,  r9         // x - off
+        vld1.32         {\d11[0]}, [r6]      // p0
+        add             r6,  r6,  #8         // += stride
+        vld1.32         {\d21[0]}, [r9]      // p1
+        add             r9,  r9,  #8         // += stride
+        vld1.32         {\d11[1]}, [r6]      // p0
+        add             r6,  r6,  #8         // += stride
+        vld1.32         {\d21[1]}, [r9]      // p1
+        add             r9,  r9,  #8         // += stride
+        vld1.32         {\d12[0]}, [r6]      // p0
+        add             r6,  r6,  #8         // += stride
+        vld1.32         {\d22[0]}, [r9]      // p1
+        add             r9,  r9,  #8         // += stride
+        vld1.32         {\d12[1]}, [r6]      // p0
+        vld1.32         {\d22[1]}, [r9]      // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+        vmin.u8         q3,  q3,  \s1
+        vmax.u8         q4,  q4,  \s1
+        vmin.u8         q3,  q3,  \s2
+        vmax.u8         q4,  q4,  \s2
+.endif
+        vabd.u8         q8,  q0,  \s1        // abs(diff)
+        vabd.u8         q11, q0,  \s2        // abs(diff)
+        vshl.u8         q9,  q8,  \shift     // abs(diff) >> shift
+        vshl.u8         q12, q11, \shift     // abs(diff) >> shift
+        vqsub.u8        q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+        vqsub.u8        q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+        vcgt.u8         q10, q0,  \s1        // px > p0
+        vcgt.u8         q13, q0,  \s2        // px > p1
+        vmin.u8         q9,  q9,  q8         // imin(abs(diff), clip)
+        vmin.u8         q12, q12, q11        // imin(abs(diff), clip)
+        vneg.s8         q8,  q9              // -imin()
+        vneg.s8         q11, q12             // -imin()
+        vbsl            q10, q8,  q9         // constrain() = imax(imin(diff, clip), -clip)
+        vdup.8          d18, \tap            // taps[k]
+        vbsl            q13, q11, q12        // constrain() = imax(imin(diff, clip), -clip)
+        vmlal.s8        q1,  d20, d18        // sum += taps[k] * constrain()
+        vmlal.s8        q1,  d26, d18        // sum += taps[k] * constrain()
+        vmlal.s8        q2,  d21, d18        // sum += taps[k] * constrain()
+        vmlal.s8        q2,  d27, d18        // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
+//                              const uint16_t *tmp, int pri_strength,
+//                              int sec_strength, int dir, int damping,
+//                              int h, size_t edges);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_neon
+.if \pri
+        movrel_local    r8,  pri_taps
+        and             r9,  r3,  #1
+        add             r8,  r8,  r9, lsl #1
+.endif
+        movrel_local    r9,  directions\w
+        add             r5,  r9,  r5, lsl #1
+        vmov.u8         d17, #7
+        vdup.8          d16, r6              // damping
+
+        vmov.8          d8[0], r3
+        vmov.8          d8[1], r4
+        vclz.i8         d8,  d8              // clz(threshold)
+        vsub.i8         d8,  d17, d8         // ulog2(threshold)
+        vqsub.u8        d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
+        vneg.s8         d8,  d8              // -shift
+.if \sec
+        vdup.8          q6,  d8[1]
+.endif
+.if \pri
+        vdup.8          q5,  d8[0]
+.endif
+
+1:
+.if \w == 8
+        add             r12, r2,  #16
+        vld1.8          {d0},  [r2,  :64]    // px
+        vld1.8          {d1},  [r12, :64]    // px
+.else
+        add             r12, r2,  #8
+        vld1.32         {d0[0]},  [r2,  :32] // px
+        add             r9,  r2,  #2*8
+        vld1.32         {d0[1]},  [r12, :32] // px
+        add             r12, r12, #2*8
+        vld1.32         {d1[0]},  [r9,  :32] // px
+        vld1.32         {d1[1]},  [r12, :32] // px
+.endif
+
+        vmov.u8         q1,  #0              // sum
+        vmov.u8         q2,  #0              // sum
+.if \min
+        vmov.u16        q3,  q0              // min
+        vmov.u16        q4,  q0              // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrement it for the second round.
+        // It also serves as the loop counter.
+        mov             lr,  #2              // sec_taps[0]
+
+2:
+.if \pri
+        ldrsb           r9,  [r5]            // off1
+
+        load_px_8       d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+        add             r5,  r5,  #4         // +2*2
+        ldrsb           r9,  [r5]            // off2
+.endif
+
+.if \pri
+        ldrb            r12, [r8]            // *pri_taps
+        vdup.8          q7,  r3              // threshold
+
+        handle_pixel_8  q14, q15, q7,  q5,  r12, \min
+.endif
+
+.if \sec
+        load_px_8       d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #8         // +2*4
+        ldrsb           r9,  [r5]            // off3
+
+        vdup.8          q7,  r4              // threshold
+
+        handle_pixel_8  q14, q15, q7,  q6,  lr, \min
+
+        load_px_8       d28, d29, d30, d31, \w
+
+        handle_pixel_8  q14, q15, q7,  q6,  lr, \min
+
+        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
+.else
+        add             r5,  r5,  #1         // r5 += 1
+.endif
+        subs            lr,  lr,  #1         // sec_tap-- (value)
+.if \pri
+        add             r8,  r8,  #1         // pri_taps++ (pointer)
+.endif
+        bne             2b
+
+        vshr.s16        q14, q1,  #15        // -(sum < 0)
+        vshr.s16        q15, q2,  #15        // -(sum < 0)
+        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
+        vadd.i16        q2,  q2,  q15        // sum - (sum < 0)
+        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
+        vrshr.s16       q2,  q2,  #4         // (8 + sum - (sum < 0)) >> 4
+        vaddw.u8        q1,  q1,  d0         // px + (8 + sum ...) >> 4
+        vaddw.u8        q2,  q2,  d1         // px + (8 + sum ...) >> 4
+        vqmovun.s16     d0,  q1
+        vqmovun.s16     d1,  q2
+.if \min
+        vmin.u8         q0,  q0,  q4
+        vmax.u8         q0,  q0,  q3         // iclip(px + .., min, max)
+.endif
+.if \w == 8
+        vst1.8          {d0}, [r0, :64], r1
+        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
+        subs            r7,  r7,  #2         // h -= 2
+        vst1.8          {d1}, [r0, :64], r1
+.else
+        vst1.32         {d0[0]}, [r0, :32], r1
+        add             r2,  r2,  #4*8       // tmp += 4*tmp_stride
+        vst1.32         {d0[1]}, [r0, :32], r1
+        subs            r7,  r7,  #4         // h -= 4
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             r5,  r5,  #2
+.if \pri
+        sub             r8,  r8,  #2
+.endif
+
+        bgt             1b
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
+
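+// div_table[n] = 840 / (n + 1), with 840 = lcm(1..8), so that squared partial
+// sums of different lengths are weighted comparably in the direction costs.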
+const div_table, align=4
+        .short         840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
+//                                   unsigned *const var)
+function cdef_find_dir_8bpc_neon, export=1
+        push            {lr}
+        vpush           {q4-q7}
+        sub             sp,  sp,  #32          // cost
+        mov             r3,  #8
+        vmov.u16        q1,  #0                // q0-q1   sum_diag[0]
+        vmov.u16        q3,  #0                // q2-q3   sum_diag[1]
+        vmov.u16        q5,  #0                // q4-q5   sum_hv[0-1]
+        vmov.u16        q8,  #0                // q6,d16  sum_alt[0]
+                                               // q7,d17  sum_alt[1]
+        vmov.u16        q9,  #0                // q9,d22  sum_alt[2]
+        vmov.u16        q11, #0
+        vmov.u16        q10, #0                // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+        vld1.8          {d30}, [r0, :64], r1
+        vmov.u8         d31, #128
+        vsubl.u8        q15, d30, d31          // img[x] - 128
+        vmov.u16        q14, #0
+
+.if \i == 0
+        vmov            q0,  q15               // sum_diag[0]
+.else
+        vext.8          q12, q14, q15, #(16-2*\i)
+        vext.8          q13, q15, q14, #(16-2*\i)
+        vadd.i16        q0,  q0,  q12          // sum_diag[0]
+        vadd.i16        q1,  q1,  q13          // sum_diag[0]
+.endif
+        vrev64.16       q13, q15
+        vswp            d26, d27               // [-x]
+.if \i == 0
+        vmov            q2,  q13               // sum_diag[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q2,  q2,  q12          // sum_diag[1]
+        vadd.i16        q3,  q3,  q13          // sum_diag[1]
+.endif
+
+        vpadd.u16       d26, d30, d31          // [(x >> 1)]
+        vmov.u16        d27, #0
+        vpadd.u16       d24, d26, d28
+        vpadd.u16       d24, d24, d28          // [y]
+        vmov.u16        r12, d24[0]
+        vadd.i16        q5,  q5,  q15          // sum_hv[1]
+.if \i < 4
+        vmov.16         d8[\i],   r12          // sum_hv[0]
+.else
+        vmov.16         d9[\i-4], r12          // sum_hv[0]
+.endif
+
+.if \i == 0
+        vmov.u16        q6,  q13               // sum_alt[0]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q14, q13, q14, #(16-2*\i)
+        vadd.i16        q6,  q6,  q12          // sum_alt[0]
+        vadd.i16        d16, d16, d28          // sum_alt[0]
+.endif
+        vrev64.16       d26, d26               // [-(x >> 1)]
+        vmov.u16        q14, #0
+.if \i == 0
+        vmov            q7,  q13               // sum_alt[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q7,  q7,  q12          // sum_alt[1]
+        vadd.i16        d17, d17, d26          // sum_alt[1]
+.endif
+
+.if \i < 6
+        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
+        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
+        vadd.i16        q9,  q9,  q12          // sum_alt[2]
+        vadd.i16        d22, d22, d26          // sum_alt[2]
+.else
+        vadd.i16        q9,  q9,  q15          // sum_alt[2]
+.endif
+.if \i == 0
+        vmov            q10, q15               // sum_alt[3]
+.elseif \i == 1
+        vadd.i16        q10, q10, q15          // sum_alt[3]
+.else
+        vext.8          q12, q14, q15, #(16-2*(\i/2))
+        vext.8          q13, q15, q14, #(16-2*(\i/2))
+        vadd.i16        q10, q10, q12          // sum_alt[3]
+        vadd.i16        d23, d23, d26          // sum_alt[3]
+.endif
+.endr
+
+        vmov.u32        q15, #105
+
+        vmull.s16       q12, d8,  d8           // sum_hv[0]*sum_hv[0]
+        vmlal.s16       q12, d9,  d9
+        vmull.s16       q13, d10, d10          // sum_hv[1]*sum_hv[1]
+        vmlal.s16       q13, d11, d11
+        vadd.s32        d8,  d24, d25
+        vadd.s32        d9,  d26, d27
+        vpadd.s32       d8,  d8,  d9           // cost[2,6] (s16, s17)
+        vmul.i32        d8,  d8,  d30          // cost[2,6] *= 105
+
+        vrev64.16       q1,  q1
+        vrev64.16       q3,  q3
+        vext.8          q1,  q1,  q1,  #10     // sum_diag[0][14-n]
+        vext.8          q3,  q3,  q3,  #10     // sum_diag[1][14-n]
+
+        vstr            s16, [sp, #2*4]        // cost[2]
+        vstr            s17, [sp, #6*4]        // cost[6]
+
+        movrel_local    r12, div_table
+        vld1.16         {q14}, [r12, :128]
+
+        vmull.s16       q5,  d0,  d0           // sum_diag[0]*sum_diag[0]
+        vmull.s16       q12, d1,  d1
+        vmlal.s16       q5,  d2,  d2
+        vmlal.s16       q12, d3,  d3
+        vmull.s16       q0,  d4,  d4           // sum_diag[1]*sum_diag[1]
+        vmull.s16       q1,  d5,  d5
+        vmlal.s16       q0,  d6,  d6
+        vmlal.s16       q1,  d7,  d7
+        vmovl.u16       q13, d28               // div_table
+        vmovl.u16       q14, d29
+        vmul.i32        q5,  q5,  q13          // cost[0]
+        vmla.i32        q5,  q12, q14
+        vmul.i32        q0,  q0,  q13          // cost[4]
+        vmla.i32        q0,  q1,  q14
+        vadd.i32        d10, d10, d11
+        vadd.i32        d0,  d0,  d1
+        vpadd.i32       d0,  d10, d0           // cost[0,4] = s0,s1
+
+        movrel_local    r12, alt_fact
+        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+        vstr            s0,  [sp, #0*4]        // cost[0]
+        vstr            s1,  [sp, #4*4]        // cost[4]
+
+        vmovl.u16       q13, d29               // div_table[2*m+1] + 105
+        vmovl.u16       q14, d30
+        vmovl.u16       q15, d31
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+        vmull.s16       q1,  \s1, \s1          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q2,  \s2, \s2
+        vmull.s16       q3,  \s3, \s3
+        vmull.s16       q5,  \s4, \s4          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q12, \s5, \s5
+        vmull.s16       q6,  \s6, \s6          // q6 overlaps the first \s1-\s2 here
+        vmul.i32        q1,  q1,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q1,  q2,  q14
+        vmla.i32        q1,  q3,  q15
+        vmul.i32        q5,  q5,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q5,  q12, q14
+        vmla.i32        q5,  q6,  q15
+        vadd.i32        d2,  d2,  d3
+        vadd.i32        d3,  d10, d11
+        vpadd.i32       \dest, d2, d3          // *cost_ptr
+.endm
+        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+        vstr            s28, [sp, #1*4]        // cost[1]
+        vstr            s29, [sp, #3*4]        // cost[3]
+
+        mov             r0,  #0                // best_dir
+        vmov.32         r1,  d0[0]             // best_cost
+        mov             r3,  #1                // n
+
+        vstr            s30, [sp, #5*4]        // cost[5]
+        vstr            s31, [sp, #7*4]        // cost[7]
+
+        vmov.32         r12, d14[0]
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+        vmov.32         lr,  \s2
+.endif
+        cmp             r12, r1                // cost[n] > best_cost
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  r12               // best_cost = cost[n]
+.ifnb \s2
+        add             r3,  r3,  #1           // n++
+        cmp             lr,  r1                // cost[n] > best_cost
+        vmov.32         r12, \s3
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  lr                // best_cost = cost[n]
+        add             r3,  r3,  #1           // n++
+.endif
+.endm
+        find_best       d14[0], d8[0], d14[1]
+        find_best       d14[1], d0[1], d15[0]
+        find_best       d15[0], d8[1], d15[1]
+        find_best       d15[1]
+
+        eor             r3,  r0,  #4           // best_dir ^ 4
+        ldr             r12, [sp, r3, lsl #2]
+        sub             r1,  r1,  r12          // best_cost - cost[best_dir ^ 4]
+        lsr             r1,  r1,  #10
+        str             r1,  [r2]              // *var
+
+        add             sp,  sp,  #32
+        vpop            {q4-q7}
+        pop             {pc}
+endfunc
diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S
new file mode 100644 (file)
index 0000000..d850a0c
--- /dev/null
@@ -0,0 +1,821 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * Copyright © 2019, B Krishnan Iyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
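+//
+// With no usable neighbour pixels the prediction is a constant mid-grey.
+// A minimal C model of the fill below:
+//
+//   for (int y = 0; y < height; y++, dst += stride)
+//       memset(dst, 128, width);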
+function ipred_dc_128_8bpc_neon, export=1
+        push            {r4, lr}
+        ldr             r4,  [sp, #8]
+        clz             r3,  r3
+        adr             r2,  L(ipred_dc_128_tbl)
+        sub             r3,  r3,  #25
+        ldr             r3,  [r2,  r3,  lsl #2]
+        mov             lr,  #128
+        vdup.8          q0,  lr
+        add             r2,  r2,  r3
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r2
+
+        .align 2
+L(ipred_dc_128_tbl):
+        .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+        .word 16f  - L(ipred_dc_128_tbl) + CONFIG_THUMB
+        .word 8f   - L(ipred_dc_128_tbl) + CONFIG_THUMB
+        .word 4f   - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             4b
+        pop             {r4, pc}
+8:
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             8b
+        pop             {r4, pc}
+16:
+        vst1.8          {d0,  d1}, [r0,  :128], r1
+        vst1.8          {d0,  d1}, [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1}, [r0,  :128], r1
+        vst1.8          {d0,  d1}, [r12, :128], r1
+        bgt             16b
+        pop             {r4, pc}
+320:
+        vdup.8          q1,  lr
+32:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        bgt             32b
+        pop             {r4, pc}
+640:
+        vdup.8          q1,  lr
+        vdup.8          q2,  lr
+        vdup.8          q3,  lr
+        sub             r1,  r1,  #32
+64:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        bgt             64b
+        pop             {r4, pc}
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
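+//
+// Vertical prediction replicates the row above the block. A minimal C
+// model:
+//
+//   for (int y = 0; y < height; y++, dst += stride)
+//       memcpy(dst, topleft + 1, width);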
+function ipred_v_8bpc_neon, export=1
+        push            {r4, lr}
+        ldr             lr,  [sp, #8]
+        clz             r3,  r3
+        adr             r4,  L(ipred_v_tbl)
+        sub             r3,  r3,  #25
+        ldr             r3,  [r4,  r3,  lsl #2]
+        add             r2,  r2,  #1
+        add             r4,  r4,  r3
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r4
+
+        .align 2
+L(ipred_v_tbl):
+        .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_v_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_v_tbl) + CONFIG_THUMB
+40:
+        vld1.32         {d0[0]},  [r2]
+4:
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        subs            lr,  lr,  #4
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             4b
+        pop             {r4, pc}
+80:
+        vld1.8          {d0}, [r2]
+8:
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        subs            lr,  lr,  #4
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             8b
+        pop             {r4, pc}
+160:
+        vld1.8          {q0},  [r2]
+16:
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        subs            lr,  lr,  #4
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        bgt             16b
+        pop             {r4, pc}
+320:
+        vld1.8          {q0,  q1},  [r2]
+32:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        subs            lr,  lr,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        bgt             32b
+        pop             {r4, pc}
+640:
+        vld1.8          {q0,  q1},  [r2]!
+        sub             r1,  r1,  #32
+        vld1.8          {q2,  q3},  [r2]
+64:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        subs            lr,  lr,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        bgt             64b
+        pop             {r4, pc}
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
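+//
+// Horizontal prediction replicates the column left of the block. A
+// minimal C model:
+//
+//   for (int y = 0; y < height; y++, dst += stride)
+//       memset(dst, topleft[-(1 + y)], width);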
+function ipred_h_8bpc_neon, export=1
+        push            {r4-r5, lr}
+        ldr             r4,  [sp, #12]
+        clz             r3,  r3
+        adr             r5,  L(ipred_h_tbl)
+        sub             r3,  r3,  #25
+        ldr             r3,  [r5,  r3,  lsl #2]
+        sub             r2,  r2,  #4
+        mov             lr,  #-4
+        add             r5,  r5,  r3
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_h_tbl):
+        .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+        .word 8f   - L(ipred_h_tbl) + CONFIG_THUMB
+        .word 4f   - L(ipred_h_tbl) + CONFIG_THUMB
+4:
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vst1.32         {d3[0]},  [r0,  :32], r1
+        vst1.32         {d2[0]},  [r12, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d1[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             4b
+        pop             {r4-r5, pc}
+8:
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vst1.8          {d3},  [r0,  :64], r1
+        vst1.8          {d2},  [r12, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d1},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             8b
+        pop             {r4-r5, pc}
+160:
+        add             r2,  r2,  #3
+        mov             lr,  #-1
+16:
+        vld1.8          {d0[],  d1[]},  [r2],  lr
+        subs            r4,  r4,  #4
+        vld1.8          {d2[],  d3[]},  [r2],  lr
+        vst1.8          {q0},  [r0,    :128],  r1
+        vld1.8          {d4[],  d5[]},  [r2],  lr
+        vst1.8          {q1},  [r12,   :128],  r1
+        vld1.8          {d6[],  d7[]},  [r2],  lr
+        vst1.8          {q2},  [r0,    :128],  r1
+        vst1.8          {q3},  [r12,   :128],  r1
+        bgt             16b
+        pop             {r4-r5, pc}
+320:
+        add             r2,  r2,  #3
+        mov             lr,  #-1
+        sub             r1,  r1,  #16
+32:
+        vld1.8          {d0[],  d1[]}, [r2],  lr
+        subs            r4,  r4,  #4
+        vld1.8          {d2[],  d3[]}, [r2],  lr
+        vst1.8          {q0},  [r0,   :128]!
+        vld1.8          {d4[],  d5[]}, [r2],  lr
+        vst1.8          {q1},  [r12,  :128]!
+        vld1.8          {d6[],  d7[]}, [r2],  lr
+        vst1.8          {q0},  [r0,   :128],  r1
+        vst1.8          {q1},  [r12,  :128],  r1
+        vst1.8          {q2},  [r0,   :128]!
+        vst1.8          {q3},  [r12,  :128]!
+        vst1.8          {q2},  [r0,   :128],  r1
+        vst1.8          {q3},  [r12,  :128],  r1
+        bgt             32b
+        pop             {r4-r5, pc}
+640:
+        add             r2,  r2,  #3
+        mov             lr,  #-1
+        sub             r1,  r1,  #48
+64:
+        vld1.8          {d0[],  d1[]},  [r2],  lr
+        subs            r4,  r4,  #4
+        vld1.8          {d2[],  d3[]},  [r2],  lr
+        vst1.8          {q0},  [r0,    :128]!
+        vld1.8          {d4[],  d5[]},  [r2],  lr
+        vst1.8          {q1},  [r12,   :128]!
+        vld1.8          {d6[],  d7[]},  [r2],  lr
+        vst1.8          {q0},  [r0,    :128]!
+        vst1.8          {q1},  [r12,   :128]!
+        vst1.8          {q0},  [r0,    :128]!
+        vst1.8          {q1},  [r12,   :128]!
+        vst1.8          {q0},  [r0,    :128],  r1
+        vst1.8          {q1},  [r12,   :128],  r1
+        vst1.8          {q2},  [r0,    :128]!
+        vst1.8          {q3},  [r12,   :128]!
+        vst1.8          {q2},  [r0,    :128]!
+        vst1.8          {q3},  [r12,   :128]!
+        vst1.8          {q2},  [r0,    :128]!
+        vst1.8          {q3},  [r12,   :128]!
+        vst1.8          {q2},  [r0,    :128],  r1
+        vst1.8          {q3},  [r12,   :128],  r1
+        bgt             64b
+        pop             {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
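+//
+// DC prediction from the top edge only: average the width pixels above
+// the block with rounding, then fill. A rough C model (ctz() stands for
+// count-trailing-zeros; width is a power of two, so the division is a
+// shift):
+//
+//   unsigned sum = width >> 1;                    // rounding bias
+//   for (int x = 0; x < width; x++) sum += topleft[1 + x];
+//   const int dc = sum >> ctz(width);
+//   for (int y = 0; y < height; y++, dst += stride)
+//       memset(dst, dc, width);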
+function ipred_dc_top_8bpc_neon, export=1
+        push            {r4-r5, lr}
+        ldr             r4,  [sp, #12]
+        clz             r3,  r3
+        adr             r5,  L(ipred_dc_top_tbl)
+        sub             r3,  r3,  #25
+        ldr             r3,  [r5,  r3,  lsl #2]
+        add             r2,  r2,  #1
+        add             r5,  r5,  r3
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_dc_top_tbl):
+        .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+        .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+        .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+        .word 80f  - L(ipred_dc_top_tbl) + CONFIG_THUMB
+        .word 40f  - L(ipred_dc_top_tbl) + CONFIG_THUMB
+40:
+        vld1.32         {d0[]},  [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #2
+        vdup.8          d0,  d0[0]
+4:
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             4b
+        pop             {r4-r5, pc}
+80:
+        vld1.8          {d0},  [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #3
+        vdup.8          d0,  d0[0]
+8:
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             8b
+        pop             {r4-r5, pc}
+160:
+        vld1.8          {d0,  d1},  [r2]
+        vaddl.u8        q0,  d0,  d1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #4
+        vdup.8          q0,  d0[0]
+16:
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        bgt             16b
+        pop             {r4-r5, pc}
+320:
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]
+        vaddl.u8        q0,  d0,  d1
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d4,  q0,  #5
+        vdup.8          q0,  d4[0]
+        vdup.8          q1,  d4[0]
+32:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        bgt             32b
+        pop             {r4-r5, pc}
+640:
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vaddl.u8        q0,  d0,  d1
+        vld1.8          {d4,  d5,  d6,  d7},  [r2]
+        vaddl.u8        q1,  d2,  d3
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        q1,  q2,  q3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d18, q0,  #6
+        vdup.8          q0,  d18[0]
+        vdup.8          q1,  d18[0]
+        vdup.8          q2,  d18[0]
+        vdup.8          q3,  d18[0]
+        sub             r1,  r1,  #32
+64:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        bgt             64b
+        pop             {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
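+//
+// Mirrors ipred_dc_top, but averages the height pixels to the left of
+// the block instead of the top row:
+//
+//   for (int y = 0; y < height; y++) sum += topleft[-(1 + y)];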
+function ipred_dc_left_8bpc_neon, export=1
+        push            {r4-r5, lr}
+        ldr             r4,  [sp, #12]
+        sub             r2,  r2,  r4
+        clz             r3,  r3
+        clz             lr,  r4
+        sub             lr,  lr,  #25
+        adr             r5,  L(ipred_dc_left_tbl)
+        sub             r3,  r3,  #20
+        ldr             r3,  [r5,  r3,  lsl #2]
+        ldr             lr,  [r5,  lr,  lsl #2]
+        add             r3,  r5,  r3
+        add             r5,  r5,  lr
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_dc_left_tbl):
+        .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_h8)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_h4)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_w8)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_left_w4)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+        vld1.32         {d0[]},  [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #2
+        vdup.8          q0,  d0[0]
+        bx              r3
+L(ipred_dc_left_w4):
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             L(ipred_dc_left_w4)
+        pop             {r4-r5, pc}
+L(ipred_dc_left_h8):
+        vld1.8          {d0},  [r2]
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #3
+        vdup.8          q0,  d0[0]
+        bx              r3
+L(ipred_dc_left_w8):
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             L(ipred_dc_left_w8)
+        pop             {r4-r5, pc}
+L(ipred_dc_left_h16):
+        vld1.8          {d0,  d1},  [r2]
+        vaddl.u8        q0,  d0,  d1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #4
+        vdup.8          q0,  d0[0]
+        bx              r3
+L(ipred_dc_left_w16):
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        bgt             L(ipred_dc_left_w16)
+        pop             {r4-r5, pc}
+L(ipred_dc_left_h32):
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]
+        vaddl.u8        q0,  d0,  d1
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #5
+        vdup.8          q0,  d0[0]
+        bx              r3
+L(ipred_dc_left_w32):
+        vmov.8          q1,  q0
+1:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        bgt             1b
+        pop             {r4-r5, pc}
+L(ipred_dc_left_h64):
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vld1.8          {d4,  d5,  d6,  d7},  [r2]
+        vaddl.u8        q0,  d0,  d1
+        vaddl.u8        q1,  d2,  d3
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        q1,  q2,  q3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        vrshrn.u16      d0,  q0,  #6
+        vdup.8          q0,  d0[0]
+        bx              r3
+L(ipred_dc_left_w64):
+        sub             r1,  r1,  #32
+        vmov.8          q1,  q0
+        vmov.8          q2,  q0
+        vmov.8          q3,  q0
+1:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        subs            r4,  r4, #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        bgt             1b
+        pop             {r4-r5, pc}
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
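+//
+// Full DC prediction: average the top row and left column together. A
+// rough C model (the 0x5556/0x3334 multipliers below approximate the
+// remaining 1/3 and 1/5 divisions when width + height is a multiple of,
+// but not itself, a power of two):
+//
+//   unsigned sum = (width + height) >> 1;         // rounding bias
+//   for (int x = 0; x < width;  x++) sum += topleft[1 + x];
+//   for (int y = 0; y < height; y++) sum += topleft[-(1 + y)];
+//   int dc = sum >> ctz(width + height);
+//   if (width != height)   // width + height == 3 << k or 5 << k
+//       dc = (dc * ((width + height) % 3 == 0 ? 0x5556 : 0x3334)) >> 16;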
+function ipred_dc_8bpc_neon, export=1
+        push            {r4-r6, lr}
+        ldr             r4,  [sp, #16]
+        sub             r2,  r2,  r4
+        add             lr,  r3,  r4        // width + height
+        clz             r3,  r3
+        clz             r12, r4
+        vdup.16         q15, lr             // width + height
+        mov             r6,  #0
+        adr             r5,  L(ipred_dc_tbl)
+        rbit            lr,  lr             // rbit(width + height)
+        sub             r3,  r3,  #20       // 25 leading bits, minus table offset 5
+        sub             r12, r12, #25
+        clz             lr,  lr             // ctz(width + height)
+        ldr             r3,  [r5,  r3,  lsl #2]
+        ldr             r12, [r5,  r12, lsl #2]
+        neg             lr,  lr             // -ctz(width + height)
+        add             r3,  r5,  r3
+        add             r5,  r5,  r12
+        vshr.u16        q15, q15, #1        // (width + height) >> 1
+        vdup.16         q14, lr             // -ctz(width + height)
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r5
+
+        .align 2
+L(ipred_dc_tbl):
+        .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h8)  - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_h4)  - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w8)  - L(ipred_dc_tbl) + CONFIG_THUMB
+        .word L(ipred_dc_w4)  - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+        vld1.32         {d0[0]},  [r2]!
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        bx              r3
+L(ipred_dc_w4):
+        add             r2,  r2,  #1
+        vld1.32         {d1[0]},  [r2]
+        vmov.32         d1[1],  r6
+        vadd.s16        d0,  d0,  d30
+        vpaddl.u8       d1,  d1
+        vpadd.u16       d1,  d1
+        vpadd.u16       d1,  d1
+        cmp             r4,  #4
+        vadd.s16        d0,  d0,  d1
+        vshl.u16        d0,  d0,  d28
+        beq             1f                  // h = 8/16
+        movw            lr,  #(0x3334/2)
+        movw            r5,  #(0x5556/2)
+        cmp             r4,  #16
+        it              ne
+        movne           lr,  r5
+        vdup.16         d30, lr
+        vqdmulh.s16     d0,  d0,  d30
+1:
+        vdup.8          d0,  d0[0]
+2:
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        subs            r4,  r4,  #4
+        vst1.32         {d0[0]},  [r0,  :32], r1
+        vst1.32         {d0[0]},  [r12, :32], r1
+        bgt             2b
+        pop             {r4-r6, pc}
+
+L(ipred_dc_h8):
+        vld1.8          {d0},  [r2]!
+        vpaddl.u8       d0,  d0
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        bx              r3
+L(ipred_dc_w8):
+        add             r2,  r2,  #1
+        vld1.8          {d2},  [r2]
+        vadd.s16        d0,  d0,  d30
+        vpaddl.u8       d2,  d2
+        vpadd.u16       d2,  d2
+        vpadd.u16       d2,  d2
+        cmp             r4,  #8
+        vadd.s16        d0,  d0,  d2
+        vshl.u16        d0,  d0,  d28
+        beq             1f                  // h = 4/16/32
+        cmp             r4,  #32
+        movw            lr,  #(0x3334/2)
+        movw            r5,  #(0x5556/2)
+        it              ne
+        movne           lr,  r5
+        vdup.16         q12, lr
+        vqdmulh.s16     d0,  d0,  d24
+1:
+        vdup.8          d0,  d0[0]
+2:
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0},  [r0,  :64], r1
+        vst1.8          {d0},  [r12, :64], r1
+        bgt             2b
+        pop             {r4-r6, pc}
+
+L(ipred_dc_h16):
+        vld1.8          {d0,  d1},  [r2]!
+        vaddl.u8        q0,  d0,  d1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        bx              r3
+L(ipred_dc_w16):
+        add             r2,  r2,  #1
+        vld1.8          {d2,  d3},  [r2]
+        vadd.s16        d0,  d0,  d30
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        d2,  d2,  d3
+        vpadd.u16       d2,  d2
+        vpadd.u16       d2,  d2
+        cmp             r4,  #16
+        vadd.s16        d0,  d0,  d2
+        vshl.u16        d0,  d0,  d28
+        beq             1f                  // h = 4/8/32/64
+        tst             r4,  #(32+16+8)     // 16 added to make a consecutive bitmask
+        movw            lr,  #(0x3334/2)
+        movw            r5,  #(0x5556/2)
+        it              ne
+        movne           lr,  r5
+        vdup.16         q12, lr
+        vqdmulh.s16     d0,  d0,  d24
+1:
+        vdup.8          q0,  d0[0]
+2:
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        subs            r4,  r4, #4
+        vst1.8          {d0,  d1},  [r0,  :128], r1
+        vst1.8          {d0,  d1},  [r12, :128], r1
+        bgt             2b
+        pop             {r4-r6, pc}
+
+L(ipred_dc_h32):
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vaddl.u8        q0,  d0,  d1
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        bx              r3
+L(ipred_dc_w32):
+        add             r2,  r2,  #1
+        vld1.8          {d2,  d3,  d4,  d5},  [r2]
+        vadd.s16        d0,  d0,  d30
+        vaddl.u8        q2,  d4,  d5
+        vadd.u16        d4,  d4,  d5
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        d2,  d2,  d3
+        vpadd.u16       d4,  d4
+        vpadd.u16       d2,  d2
+        vpadd.u16       d4,  d4
+        vpadd.u16       d2,  d2
+        cmp             r4,  #32
+        vadd.s16        d0,  d0,  d4
+        vadd.s16        d0,  d0,  d2
+        vshl.u16        d4,  d0,  d28
+        beq             1f                  // h = 8/16/64
+        cmp             r4,  #8
+        movw            lr,  #(0x3334/2)
+        movw            r5,  #(0x5556/2)
+        it              ne
+        movne           lr,  r5
+        vdup.16         q12, lr
+        vqdmulh.s16     d4,  d4,  d24
+1:
+        vdup.8          q0,  d4[0]
+        vdup.8          q1,  d4[0]
+2:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128], r1
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128], r1
+        bgt             2b
+        pop             {r4-r6, pc}
+
+L(ipred_dc_h64):
+        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vaddl.u8        q0,  d0,  d1
+        vld1.8          {d4,  d5,  d6,  d7},  [r2]!
+        vaddl.u8        q1,  d2,  d3
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q3,  d6,  d7
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        q1,  q2,  q3
+        vadd.u16        q0,  q0,  q1
+        vadd.u16        d0,  d0,  d1
+        vpadd.u16       d0,  d0
+        vpadd.u16       d0,  d0
+        bx              r3
+L(ipred_dc_w64):
+        add             r2,  r2,  #1
+        vld1.8          {d2,  d3,  d4,  d5},  [r2]!
+        vadd.s16        d0,  d0,  d30
+        vaddl.u8        q2,  d4,  d5
+        vaddl.u8        q1,  d2,  d3
+        vadd.u16        d4,  d4,  d5
+        vadd.u16        d2,  d2,  d3
+        vld1.8          {d16, d17, d18, d19}, [r2]
+        vpadd.u16       d4,  d4
+        vpadd.u16       d2,  d2
+        vpadd.u16       d4,  d4
+        vpadd.u16       d2,  d2
+        vaddl.u8        q8,  d16, d17
+        vaddl.u8        q9,  d18, d19
+        vadd.u16        d16, d16, d17
+        vadd.u16        d18, d18, d19
+        vpadd.u16       d16, d16
+        vpadd.u16       d18, d18
+        vpadd.u16       d16, d16
+        vpadd.u16       d18, d18
+        vadd.u16        d2,  d2,  d4
+        vadd.u16        d3,  d16, d18
+        cmp             r4,  #64
+        vadd.s16        d0,  d0,  d2
+        vadd.s16        d0,  d0,  d3
+        vshl.u16        d18, d0,  d28
+        beq             1f                  // h = 16/32
+        movw            lr,  #(0x5556/2)
+        movt            lr,  #(0x3334/2)
+        mov             r5,  r4
+        and             r5,  r5,  #31
+        lsr             lr,  lr,  r5
+        vdup.16         d30, lr
+        vqdmulh.s16     d18, d18, d30
+1:
+        sub             r1,  r1,  #32
+        vdup.8          q0,  d18[0]
+        vdup.8          q1,  d18[0]
+        vdup.8          q2,  d18[0]
+        vdup.8          q3,  d18[0]
+2:
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        subs            r4,  r4,  #4
+        vst1.8          {d0,  d1,  d2,  d3},  [r0,  :128]!
+        vst1.8          {d0,  d1,  d2,  d3},  [r12, :128]!
+        vst1.8          {d4,  d5,  d6,  d7},  [r0,  :128], r1
+        vst1.8          {d4,  d5,  d6,  d7},  [r12, :128], r1
+        bgt             2b
+        pop             {r4-r6, pc}
+endfunc
+
diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S
new file mode 100644 (file)
index 0000000..867eb19
--- /dev/null
@@ -0,0 +1,3386 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3   external parameters
+// r4      function pointer to first transform
+// r5      function pointer to second transform
+// r6      output parameter for helper function
+// r7      input parameter for helper function
+// r8      input stride for helper function
+// r9      scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+//         scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3   multiplication coefficients
+// d4-d7   scratch registers
+// d8-d15  unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+//   transform functions. (The register layout is designed to potentially
+//   allow this.)
+// - Using a simplified version of the transforms themselves for cases where
+//   a significant number of the inputs are known to be zero. E.g. if the eob
+//   value indicates that only a quarter of the input values are set, a
+//   significant amount of calculation can be skipped for idct16 and up, at
+//   the cost of more code duplication and special casing.
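+//
+// As a rough C model, each function runs the two 1-D transform passes
+// selected via r4/r5 over the coefficients and then adds the rounded
+// residual with clipping (iclip_u8() here stands for clamping to
+// [0, 255]; the +8 >> 4 matches the default shift in load_add_store):
+//
+//   for (int y = 0; y < h; y++, dst += dst_stride)
+//       for (int x = 0; x < w; x++)
+//           dst[x] = iclip_u8(dst[x] + ((tmp[y * w + x] + 8) >> 4));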
+
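+// Q12 fixed-point transform constants; 2896 ~= 4096/sqrt(2), and the
+// remaining pairs are the cosine/sine factors of the corresponding
+// butterfly stages.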
+const idct_coeffs, align=4
+        // idct4
+        .short          2896, 2896*8, 1567, 3784
+        // idct8
+        .short          799, 4017, 3406, 2276
+        // idct16
+        .short          401, 4076, 3166, 2598
+        .short          1931, 3612, 3920, 1189
+        // idct32
+        .short          201, 4091, 3035, 2751
+        .short          1751, 3703, 3857, 1380
+        .short          995, 3973, 3513, 2106
+        .short          2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+        .short          101*8, 4095*8, 2967*8, -2824*8
+        .short          1660*8, 3745*8, 3822*8, -1474*8
+        .short          4076, 401, 4017, 799
+
+        .short          4036*8, -700*8, 2359*8, 3349*8
+        .short          3461*8, -2191*8, 897*8, 3996*8
+        .short          -3166, -2598, -799, -4017
+
+        .short          501*8, 4065*8, 3229*8, -2520*8
+        .short          2019*8, 3564*8, 3948*8, -1092*8
+        .short          3612, 1931, 2276, 3406
+
+        .short          4085*8, -301*8, 2675*8, 3102*8
+        .short          3659*8, -1842*8, 1285*8, 3889*8
+        .short          -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+        // .h[4-5] can be interpreted as .s[2]
+        .short          1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+        .short          4076, 401, 3612, 1931
+        .short          2598, 3166, 1189, 3920
+        // idct_coeffs
+        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+        .short          4091, 201, 3973, 995
+        .short          3703, 1751, 3290, 2440
+        .short          2751, 3035, 2106, 3513
+        .short          1380, 3857, 601, 4052
+endconst
+
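+// Multiply-accumulate helpers, widening from s16 to s32: vmull_vmlal
+// computes \d0 = \s0*\c0 + \s1*\c1, vmull_vmlsl computes
+// \d0 = \s0*\c0 - \s1*\c1; the _8h variants do the same on 8-halfword
+// register pairs, and vrshrn_8h narrows such a pair back to s16 with
+// rounding.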
+.macro vmull_vmlal d0, s0, s1, c0, c1
+        vmull.s16       \d0, \s0, \c0
+        vmlal.s16       \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1
+        vmull.s16       \d0, \s0, \c0
+        vmlal.s16       \d0, \s2, \c1
+        vmull.s16       \d1, \s1, \c0
+        vmlal.s16       \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1
+        vmull.s16       \d0, \s0, \c0
+        vmlsl.s16       \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1
+        vmull.s16       \d0, \s0, \c0
+        vmlsl.s16       \d0, \s2, \c1
+        vmull.s16       \d1, \s1, \c0
+        vmlsl.s16       \d1, \s3, \c1
+.endm
+
+.macro vrshrn_8h d0, d1, s0, s1, shift
+        vrshrn.i32      \d0, \s0, \shift
+        vrshrn.i32      \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+        vqrdmulh.s16    \r0, \r0, \c
+        vqrdmulh.s16    \r1, \r1, \c
+.ifnb \r2
+        vqrdmulh.s16    \r2, \r2, \c
+        vqrdmulh.s16    \r3, \r3, \c
+.endif
+.ifnb \r4
+        vqrdmulh.s16    \r4, \r4, \c
+        vqrdmulh.s16    \r5, \r5, \c
+        vqrdmulh.s16    \r6, \r6, \c
+        vqrdmulh.s16    \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+        vld1.8          {\load},  [\src, :64], r1
+.endif
+.ifnb \shift
+        vrshr.s16       \shift,  \shift,  #\shiftbits
+.endif
+.ifnb \addsrc
+        vaddw.u8        \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+        vqmovun.s16     \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+        vst1.8          {\store},  [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+        mov             \src, \dst
+        load_add_store  d2,  q8,    ,    ,    ,    ,    , \dst, \src, \shiftbits
+        load_add_store  d3,  q9,    ,    ,    ,    ,    , \dst, \src, \shiftbits
+        load_add_store  d4,  q10, d2,  q8,    ,    ,    , \dst, \src, \shiftbits
+        load_add_store  d5,  q11, d3,  q9,  q8,  d2,    , \dst, \src, \shiftbits
+        load_add_store  d6,  q12, d4,  q10, q9,  d3,  d2, \dst, \src, \shiftbits
+        load_add_store  d7,  q13, d5,  q11, q10, d4,  d3, \dst, \src, \shiftbits
+        load_add_store  d2,  q14, d6,  q12, q11, d5,  d4, \dst, \src, \shiftbits
+        load_add_store  d3,  q15, d7,  q13, q12, d6,  d5, \dst, \src, \shiftbits
+        load_add_store    ,     , d2,  q14, q13, d7,  d6, \dst, \src, \shiftbits
+        load_add_store    ,     , d3,  q15, q14, d2,  d7, \dst, \src, \shiftbits
+        load_add_store    ,     ,   ,     , q15, d3,  d2, \dst, \src, \shiftbits
+        load_add_store    ,     ,   ,     ,    ,   ,  d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+        mov             \src, \dst
+        load_add_store  d2,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store  d3,  q9,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store  d4,  q10, d2,  q8,    ,    ,    ,  \dst, \src
+        load_add_store  d5,  q11, d3,  q9,  q8,  d2,    ,  \dst, \src
+        load_add_store    ,     , d4,  q10, q9,  d3,  d2,  \dst, \src
+        load_add_store    ,     , d5,  q11, q10, d4,  d3,  \dst, \src
+        load_add_store    ,     ,   ,     , q11, d5,  d4,  \dst, \src
+        load_add_store    ,     ,   ,     ,    ,   ,  d5,  \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+        vld1.32         {\load[0]},  [\src, :32], r1
+.endif
+.ifnb \shift
+        vrshr.s16       \shift,  \shift,  #4
+.endif
+.ifnb \load
+        vld1.32         {\load[1]},  [\src, :32], r1
+.endif
+.ifnb \addsrc
+        vaddw.u8        \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+        vst1.32         {\store[0]},  [\dst, :32], r1
+.endif
+.ifnb \narrowsrc
+        vqmovun.s16     \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+        vst1.32         {\store[1]},  [\dst, :32], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+        mov             \src, \dst
+        load_add_store4 d0,    ,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d1,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d2,  q9,  d0,  q8,    ,    ,    ,  \dst, \src
+        load_add_store4 d3,  q10, d1,  q9,  q8,  d0,    ,  \dst, \src
+        load_add_store4 d4,  q11, d2,  q10, q9,  d1,  d0,  \dst, \src
+        load_add_store4 d5,  q12, d3,  q11, q10, d2,  d1,  \dst, \src
+        load_add_store4 d6,  q13, d4,  q12, q11, d3,  d2,  \dst, \src
+        load_add_store4 d7,  q14, d5,  q13, q12, d4,  d3,  \dst, \src
+        load_add_store4   ,  q15, d6,  q14, q13, d5,  d4,  \dst, \src
+        load_add_store4   ,     , d7,  q15, q14, d6,  d5,  \dst, \src
+        load_add_store4   ,     ,   ,     , q15, d7,  d6,  \dst, \src
+        load_add_store4   ,     ,   ,     ,    ,   ,  d7,  \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+        mov             \src, \dst
+        load_add_store4 d0,    ,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d1,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d2,  q9,  d0,  q8,    ,    ,    ,  \dst, \src
+        load_add_store4 d3,  q10, d1,  q9,  q8,  d0,    ,  \dst, \src
+        load_add_store4   ,  q11, d2,  q10, q9,  d1,  d0,  \dst, \src
+        load_add_store4   ,     , d3,  q11, q10, d2,  d1,  \dst, \src
+        load_add_store4   ,     ,   ,     , q11, d3,  d2,  \dst, \src
+        load_add_store4   ,     ,   ,     ,    ,   ,  d3,  \dst, \src
+.endm
+
+.macro idct_dc w, h, shift
+        cmp             r3,  #0
+        bne             1f
+        vmov.i16        d30, #0
+        movw            r12, #2896*8
+        vld1.16         {d16[]},  [r2, :16]
+        vdup.16         d0,  r12
+        vqrdmulh.s16    d16, d16, d0[0]
+        vst1.16         {d30[0]}, [r2, :16]
+.if (\w == 2*\h) || (2*\w == \h)
+        vqrdmulh.s16    d16, d16, d0[0]
+.endif
+.if \shift > 0
+        vrshr.s16       d16, d16, #\shift
+.endif
+        vqrdmulh.s16    d20, d16, d0[0]
+        mov             r3,  #\h
+        vrshr.s16       d16, d20, #4
+        vrshr.s16       d17, d20, #4
+        b               idct_dc_w\w\()_neon
+1:
+.endm
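+
+// A rough C model of the dc-only path above (vqrdmulh by 2896*8 is a
+// rounded, saturating multiply by 2896/4096 ~= 1/sqrt(2)):
+//
+//   int dc = (coeff[0] * 2896 + (1 << 11)) >> 12;       // pass-1 scale
+//   coeff[0] = 0;                                       // consume the dc
+//   if (w == 2 * h || h == 2 * w)
+//       dc = (dc * 2896 + (1 << 11)) >> 12;             // rect correction
+//   if (shift > 0) dc = (dc + (1 << (shift - 1))) >> shift;
+//   dc = (dc * 2896 + (1 << 11)) >> 12;                 // pass-2 scale
+//   dc = (dc + 8) >> 4;
+//   // ... idct_dc_w*_neon then adds dc to every pixel with clipping.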
+
+function idct_dc_w4_neon
+1:
+        vld1.32         {d0[0]}, [r0, :32], r1
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[0]}, [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+        subs            r3,  r3,  #4
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u8        q10, q8,  d0
+        vqmovun.s16     d0,  q10
+        vaddw.u8        q11, q8,  d1
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vqmovun.s16     d1,  q11
+        vst1.32         {d0[1]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w8_neon
+1:
+        vld1.8          {d0}, [r0, :64], r1
+        vld1.8          {d1}, [r0, :64], r1
+        vld1.8          {d2}, [r0, :64], r1
+        vaddw.u8        q10, q8,  d0
+        vld1.8          {d3}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #2
+        subs            r3,  r3,  #4
+        vaddw.u8        q11, q8,  d1
+        vqmovun.s16     d0,  q10
+        vaddw.u8        q12, q8,  d2
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q13, q8,  d3
+        vst1.8          {d0}, [r0, :64], r1
+        vqmovun.s16     d2,  q12
+        vst1.8          {d1}, [r0, :64], r1
+        vqmovun.s16     d3,  q13
+        vst1.8          {d2}, [r0, :64], r1
+        vst1.8          {d3}, [r0, :64], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w16_neon
+1:
+        vld1.8          {q0}, [r0, :128], r1
+        vld1.8          {q1}, [r0, :128], r1
+        vld1.8          {q2}, [r0, :128], r1
+        subs            r3,  r3,  #4
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vld1.8          {q3}, [r0, :128], r1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0}, [r0, :128], r1
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q1}, [r0, :128], r1
+        vst1.8          {q2}, [r0, :128], r1
+        vst1.8          {q3}, [r0, :128], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w32_neon
+1:
+        vld1.8          {q0, q1}, [r0, :128], r1
+        subs            r3,  r3,  #2
+        vld1.8          {q2, q3}, [r0, :128], r1
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  r1, lsl #1
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0, q1}, [r0, :128], r1
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q2, q3}, [r0, :128], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w64_neon
+        sub             r1,  r1,  #32
+1:
+        vld1.8          {q0, q1}, [r0, :128]!
+        subs            r3,  r3,  #1
+        vld1.8          {q2, q3}, [r0, :128]
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  #32
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0, q1}, [r0, :128]!
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q2, q3}, [r0, :128], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+.macro iwht4
+        vadd.i16        d16, d16, d17
+        vsub.i16        d21, d18, d19
+        vsub.i16        d20, d16, d21
+        vshr.s16        d20, d20, #1
+        vsub.i16        d18, d20, d17
+        vsub.i16        d17, d20, d19
+        vadd.i16        d19, d21, d18
+        vsub.i16        d16, d16, d17
+.endm
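+
+// One 1-D pass of the inverse Walsh-Hadamard transform on d16-d19
+// (c0-c3); in C:
+//
+//   const int t0 = c0 + c1, t1 = c2 - c3;
+//   const int e  = (t0 - t1) >> 1;
+//   const int o1 = e - c3, o2 = e - c1;
+//   c0 = t0 - o1;  c1 = o1;  c2 = o2;  c3 = t1 + o2;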
+
+.macro idct_4h_x4 r0, r1, r2, r3
+        vmull_vmlal     q3,  \r1, \r3, d0[3], d0[2]
+        vmull_vmlsl     q2,  \r1, \r3, d0[2], d0[3]
+        vmull_vmlal     q1,  \r0, \r2, d0[0], d0[0]
+        vrshrn.i32      d6,  q3,  #12
+        vrshrn.i32      d7,  q2,  #12
+        vmull_vmlsl     q2,  \r0, \r2, d0[0], d0[0]
+        vrshrn.i32      d2,  q1,  #12
+        vrshrn.i32      d3,  q2,  #12
+        vqadd.s16       \r0, d2,  d6
+        vqsub.s16       \r3, d2,  d6
+        vqadd.s16       \r1, d3,  d7
+        vqsub.s16       \r2, d3,  d7
+.endm
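+
+// The 4-point inverse DCT above in C (idct_8h_x4 below is the same math
+// on 8-halfword vectors; the adds/subs saturate in the NEON version):
+//
+//   const int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
+//   const int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
+//   const int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
+//   const int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
+//   out0 = t0 + t3;  out1 = t1 + t2;
+//   out2 = t1 - t2;  out3 = t0 - t3;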
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+        vmull_vmlal_8h  q6,  q7,  \r2, \r3, \r6, \r7, d0[3], d0[2]
+        vmull_vmlsl_8h  q4,  q5,  \r2, \r3, \r6, \r7, d0[2], d0[3]
+        vmull_vmlal_8h  q2,  q3,  \r0, \r1, \r4, \r5, d0[0], d0[0]
+        vrshrn_8h       d12, d13, q6,  q7,  #12
+        vrshrn_8h       d14, d15, q4,  q5,  #12
+        vmull_vmlsl_8h  q4,  q5,  \r0, \r1, \r4, \r5, d0[0], d0[0]
+        vrshrn_8h       d4,  d5,  q2,  q3,  #12
+        vrshrn_8h       d6,  d7,  q4,  q5,  #12
+        vqadd.s16       \q0, q2,  q6
+        vqsub.s16       \q3, q2,  q6
+        vqadd.s16       \q1, q3,  q7
+        vqsub.s16       \q2, q3,  q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64]
+        idct_4h_x4      d16, d17, d18, d19
+        bx              lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64]
+        idct_8h_x4      q8,  q9,  q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+        movrel_local    r12, iadst4_coeffs
+        vld1.16         {d0, d1}, [r12, :128]
+
+        vsubl.s16       q1,  d16, d18
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d19, d0[2]
+        vmull.s16       q10, d17, d0[3]
+        vaddw.s16       q1,  q1,  d19
+        vmull.s16       q3,  d16, d0[2]
+        vmlsl.s16       q3,  d18, d0[0]
+        vmlsl.s16       q3,  d19, d0[1]
+
+        vadd.s32        q11, q2,  q3
+        vmul.s32        q1,  q1,  d1[0]
+        vadd.s32        q2,  q2,  q10
+        vadd.s32        q3,  q3,  q10
+        vsub.s32        q11, q11, q10
+
+        vrshrn.i32      \o0, q2,  #12
+        vrshrn.i32      \o2, q1,  #12
+        vrshrn.i32      \o1, q3,  #12
+        vrshrn.i32      \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+        iadst_4x4       d16, d17, d18, d19
+        bx              lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+        iadst_4x4       d19, d18, d17, d16
+        bx              lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7
+        movrel_local    r12, iadst4_coeffs
+        vld1.16         {d0, d1}, [r12, :128]
+
+        vsubl.s16       q2,  d16, d20
+        vsubl.s16       q3,  d17, d21
+        vmull.s16       q4,  d16, d0[0]
+        vmlal.s16       q4,  d20, d0[1]
+        vmlal.s16       q4,  d22, d0[2]
+        vmull.s16       q5,  d17, d0[0]
+        vmlal.s16       q5,  d21, d0[1]
+        vmlal.s16       q5,  d23, d0[2]
+        vaddw.s16       q2,  q2,  d22
+        vaddw.s16       q3,  q3,  d23
+        vmull.s16       q6,  d16, d0[2]
+        vmlsl.s16       q6,  d20, d0[0]
+        vmlsl.s16       q6,  d22, d0[1]
+        vmull.s16       q7,  d17, d0[2]
+        vmlsl.s16       q7,  d21, d0[0]
+        vmlsl.s16       q7,  d23, d0[1]
+
+        vmul.s32        q10, q2,  d1[0]
+        vmul.s32        q11, q3,  d1[0]
+
+        vmull.s16       q2,  d18, d0[3]
+        vmull.s16       q3,  d19, d0[3]
+
+        vadd.s32        q8,  q4,  q2 // out0
+        vadd.s32        q9,  q5,  q3
+
+        vadd.s32        q4,  q4,  q6 // out3
+        vadd.s32        q5,  q5,  q7
+
+        vadd.s32        q6,  q6,  q2 // out1
+        vadd.s32        q7,  q7,  q3
+
+        vsub.s32        q4,  q4,  q2 // out3
+        vsub.s32        q5,  q5,  q3
+
+        vrshrn.i32      d20, q10, #12
+        vrshrn.i32      d21, q11, #12
+
+        vrshrn.i32      \o0, q8,  #12
+        vrshrn.i32      \o1, q9,  #12
+
+.ifc \o4, d18
+        vmov            q9,  q10
+.endif
+
+        vrshrn.i32      \o2, q6,  #12
+        vrshrn.i32      \o3, q7,  #12
+
+        vrshrn.i32      \o6, q4,  #12
+        vrshrn.i32      \o7, q5,  #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+        iadst_8x4       d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+        iadst_8x4       d22, d23, d20, d21, d18, d19, d16, d17
+        bx              lr
+endfunc
+
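+// The 4-point identity transform scales by sqrt(2); 5793/4096 is its
+// fixed-point approximation. vqrdmulh.s16 with (5793-4096)*8 computes
+// x*(5793-4096)/4096, and the following vqadd adds x back in, giving
+// x*5793/4096 with saturation.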
+function inv_identity_4h_x4_neon, export=1
+        movw            r12, #(5793-4096)*8
+        vdup.16         d0,  r12
+        vqrdmulh.s16    q2,  q8,  d0[0]
+        vqrdmulh.s16    q3,  q9,  d0[0]
+        vqadd.s16       q8,  q8,  q2
+        vqadd.s16       q9,  q9,  q3
+        bx              lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+        movw            r12, #(5793-4096)*8
+        vdup.16         d0,  r12
+        vqrdmulh.s16    q1,  q8,  d0[0]
+        vqrdmulh.s16    q2,  q9,  d0[0]
+        vqrdmulh.s16    q3,  q10, d0[0]
+        vqadd.s16       q8,  q8,  q1
+        vqrdmulh.s16    q1,  q11, d0[0]
+        vqadd.s16       q9,  q9,  q2
+        vqadd.s16       q10, q10, q3
+        vqadd.s16       q11, q11, q1
+        bx              lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0, \r1, \r2, \r3
+        vqrdmulh.s16    q1,  \i,  \c
+        vrhadd.s16      \i,  \i,  q1
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+        push            {r4-r5,lr}
+        vmov.i16        q15, #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]
+        vst1.16         {q15}, [r2, :128]!
+
+        vshr.s16        q8,  q8,  #2
+        vshr.s16        q9,  q9,  #2
+
+        iwht4
+
+        vst1.16         {q15}, [r2, :128]!
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+
+        iwht4
+
+        vld1.32         {d0[]},  [r0, :32], r1
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[]},  [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+
+        b               L(itx_4x4_end)
+endfunc
+
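+// Generic 4x4 path: r4 holds the first-pass (row) transform and r5 the
+// second-pass (column) transform, called via blx, with a 4x4 transpose
+// in between.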
+function inv_txfm_add_4x4_neon
+        vmov.i16        q15, #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]
+        vst1.16         {q15}, [r2, :128]!
+
+        blx             r4
+
+        vst1.16         {q15}, [r2, :128]!
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+
+        blx             r5
+
+        vld1.32         {d0[]},  [r0, :32], r1
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[]},  [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+        vrshr.s16       q8,  q8,  #4
+        vrshr.s16       q9,  q9,  #4
+
+L(itx_4x4_end):
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u8        q8,  q8,  d0
+        vqmovun.s16     d0,  q8
+        vaddw.u8        q9,  q9,  d1
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vqmovun.s16     d1,  q9
+        vst1.32         {d0[1]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+
+        pop             {r4-r5,pc}
+endfunc
+
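+// For dct_dct, an eob (r3) of zero means only the DC coefficient can be
+// nonzero, so both transform passes collapse to scaling that single
+// value: each vqrdmulh with 2896*8 applies 2896/4096 ~= 1/sqrt(2), once
+// per pass, before the final #4 downshift and add to the destination.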
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+        push            {r4-r5,lr}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        cmp             r3,  #0
+        bne             1f
+        vmov.i16        d30, #0
+        movw            r12, #2896*8
+        vld1.16         {d16[]},  [r2, :16]
+        vdup.16         d4,  r12
+        vst1.16         {d30[0]}, [r2, :16]
+        vqrdmulh.s16    d16, d16, d4[0]
+        vld1.32         {d0[0]},  [r0, :32], r1
+        vqrdmulh.s16    d20, d16, d4[0]
+        vld1.32         {d0[1]},  [r0, :32], r1
+        vrshr.s16       d16, d20, #4
+        vrshr.s16       d17, d20, #4
+        vld1.32         {d1[0]},  [r0, :32], r1
+        vmov            q9,  q8
+        vld1.32         {d1[1]}, [r0, :32], r1
+        b               L(itx_4x4_end)
+1:
+.endif
+        movrel_local    r4,  inv_\txfm1\()_4h_x4_neon
+        movrel_local    r5,  inv_\txfm2\()_4h_x4_neon
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
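+// 8-point DCT: the even-indexed inputs reuse idct_8h_x4, while the
+// odd-indexed inputs are rotated into t4a..t7a and folded in with
+// butterflies.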
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        idct_8h_x4      \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13
+
+        vmull_vmlsl_8h  q2,   q3,   \r2,  \r3,  \r14, \r15, d1[0], d1[1] // -> t4a
+        vmull_vmlal_8h  q4,   q5,   \r2,  \r3,  \r14, \r15, d1[1], d1[0] // -> t7a
+        vmull_vmlsl_8h  q6,   q7,   \r10, \r11, \r6,  \r7,  d1[2], d1[3] // -> t5a
+        vrshrn_8h       \r2,  \r3,  q2,   q3,   #12         // t4a
+        vrshrn_8h       \r14, \r15, q4,   q5,   #12         // t7a
+        vmull_vmlal_8h  q2,   q3,   \r10, \r11, \r6,  \r7,  d1[3], d1[2] // -> t6a
+        vrshrn_8h       \r6,  \r7,  q6,   q7,   #12         // t5a
+        vrshrn_8h       \r10, \r11, q2,   q3,   #12         // t6a
+
+        vqadd.s16       q2,   \q1,  \q3 // t4
+        vqsub.s16       \q1,  \q1,  \q3 // t5a
+        vqadd.s16       q3,   \q7,  \q5 // t7
+        vqsub.s16       \q3,  \q7,  \q5 // t6a
+
+        vmull_vmlsl_8h  q4,   q5,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t5
+        vmull_vmlal_8h  q6,   q7,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t6
+        vrshrn_8h       d8,   d9,   q4,   q5,  #12 // t5
+        vrshrn_8h       d10,  d11,  q6,   q7,  #12 // t6
+
+        vqsub.s16       \q7,  \q0,  q3 // out7
+        vqadd.s16       \q0,  \q0,  q3 // out0
+        vqadd.s16       \q1,  \q2,  q5 // out1
+        vqsub.s16       q6,   \q2,  q5 // out6
+        vqadd.s16       \q2,  \q4,  q4 // out2
+        vqsub.s16       \q5,  \q4,  q4 // out5
+        vqadd.s16       \q3,  \q6,  q2 // out3
+        vqsub.s16       \q4,  \q6,  q2 // out4
+        vmov            \q6,  q6       // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+        idct_4h_x4      \r0, \r2, \r4, \r6
+
+        vmull_vmlsl     q1,   \r1,  \r7, d1[0], d1[1] // -> t4a
+        vmull_vmlal     q2,   \r1,  \r7, d1[1], d1[0] // -> t7a
+        vmull_vmlsl     q3,   \r5,  \r3, d1[2], d1[3] // -> t5a
+        vrshrn.i32      \r1,  q1,   #12               // t4a
+        vmull_vmlal     q1,   \r5,  \r3, d1[3], d1[2] // -> t6a
+        vrshrn.i32      \r7,  q2,   #12               // t7a
+        vrshrn.i32      \r3,  q3,   #12               // t5a
+        vrshrn.i32      \r5,  q1,   #12               // t6a
+
+        vqadd.s16       d2,   \r1,  \r3 // t4
+        vqsub.s16       \r1,  \r1,  \r3 // t5a
+        vqadd.s16       d3,   \r7,  \r5 // t7
+        vqsub.s16       \r3,  \r7,  \r5 // t6a
+
+        vmull_vmlsl     q2,   \r3,  \r1, d0[0], d0[0] // -> t5
+        vmull_vmlal     q3,   \r3,  \r1, d0[0], d0[0] // -> t6
+        vrshrn.i32      d4,   q2,   #12               // t5
+        vrshrn.i32      d5,   q3,   #12               // t6
+
+        vqsub.s16       \r7,  \r0,  d3 // out7
+        vqadd.s16       \r0,  \r0,  d3 // out0
+        vqadd.s16       \r1,  \r2,  d5 // out1
+        vqsub.s16       d6,   \r2,  d5 // out6
+        vqadd.s16       \r2,  \r4,  d4 // out2
+        vqsub.s16       \r5,  \r4,  d4 // out5
+        vqadd.s16       \r3,  \r6,  d2 // out3
+        vqsub.s16       \r4,  \r6,  d2 // out4
+        vmov            \r6,  d6       // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0}, [r12, :128]
+        idct_8h_x8      q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0}, [r12, :128]
+        idct_4h_x8      d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        movrel_local    r12, iadst8_coeffs
+        vld1.16         {d0, d1, d2}, [r12, :64]
+
+        vmull_vmlal_8h  q2,  q3,  d30, d31, d16, d17, d0[0], d0[1]
+        vmull_vmlsl_8h  q4,  q5,  d30, d31, d16, d17, d0[1], d0[0]
+        vmull_vmlal_8h  q6,  q7,  d26, d27, d20, d21, d0[2], d0[3]
+        vrshrn_8h       d16, d17, q2,  q3,  #12  // t0a
+        vrshrn_8h       d30, d31, q4,  q5,  #12  // t1a
+        vmull_vmlsl_8h  q2,  q3,  d26, d27, d20, d21, d0[3], d0[2]
+        vmull_vmlal_8h  q4,  q5,  d22, d23, d24, d25, d1[0], d1[1]
+        vrshrn_8h       d20, d21, q6,  q7,  #12  // t2a
+        vrshrn_8h       d26, d27, q2,  q3,  #12  // t3a
+        vmull_vmlsl_8h  q6,  q7,  d22, d23, d24, d25, d1[1], d1[0]
+        vmull_vmlal_8h  q2,  q3,  d18, d19, d28, d29, d1[2], d1[3]
+        vrshrn_8h       d24, d25, q4,  q5,  #12  // t4a
+        vrshrn_8h       d22, d23, q6,  q7,  #12  // t5a
+        vmull_vmlsl_8h  q4,  q5,  d18, d19, d28, d29, d1[3], d1[2]
+        vrshrn_8h       d28, d29, q2,  q3,  #12  // t6a
+        vrshrn_8h       d18, d19, q4,  q5,  #12  // t7a
+
+        vqadd.s16       q2,  q8,  q12 // t0
+        vqsub.s16       q3,  q8,  q12 // t4
+        vqadd.s16       q4,  q15, q11 // t1
+        vqsub.s16       q5,  q15, q11 // t5
+        vqadd.s16       q6,  q10, q14 // t2
+        vqsub.s16       q7,  q10, q14 // t6
+        vqadd.s16       q10, q13, q9  // t3
+        vqsub.s16       q11, q13, q9  // t7
+
+        vmull_vmlal_8h  q8,  q9,  d6,  d7,  d10, d11, d2[3], d2[2]
+        vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[2], d2[3]
+        vmull_vmlsl_8h  q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+        vrshrn_8h       d6,  d7,  q8,  q9,  #12  // t4a
+        vrshrn_8h       d10, d11, q12, q13, #12  // t5a
+
+        vmull_vmlal_8h  q8,  q9,  d22, d23, d14, d15, d2[2], d2[3]
+
+        vrshrn_8h       d14, d15, q14, q15, #12  // t6a
+        vrshrn_8h       d22, d23, q8,  q9,  #12  // t7a
+
+        vqadd.s16       \q0, q2,  q6  // out0
+        vqsub.s16       q2,  q2,  q6  // t2
+        vqadd.s16       \q7, q4,  q10 // out7
+        vqsub.s16       q4,  q4,  q10 // t3
+        vqneg.s16       \q7, \q7     // out7
+
+        vqadd.s16       \q1, q3,  q7  // out1
+        vqsub.s16       q3,  q3,  q7  // t6
+        vqadd.s16       \q6, q5,  q11 // out6
+        vqsub.s16       q5,  q5,  q11 // t7
+        vqneg.s16       \q1, \q1     // out1
+
+        vmull_vmlal_8h  q10, q11, d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out3 (q11 or q12)
+        vmull_vmlsl_8h  q6,  q7,  d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out4 (q12 or q11)
+        vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+        vrshrn_8h       d4,  d5,  q10, q11, #12 // out3
+        vmull_vmlal_8h  q10, q11, d6,  d7,  d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+        vrshrn_8h       d6,  d7,  q12, q13, #12 // out5
+        vrshrn_8h       \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+        vrshrn_8h       \r8, \r9, q6,  q7,  #12 // out4 (q12 or q11)
+
+        vqneg.s16       \q3, q2     // out3
+        vqneg.s16       \q5, q3     // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+        movrel_local    r12, iadst8_coeffs
+        vld1.16         {d0, d1, d2}, [r12, :64]
+
+        vmull_vmlal     q2,  d23, d16, d0[0], d0[1]
+        vmull_vmlsl     q3,  d23, d16, d0[1], d0[0]
+        vmull_vmlal     q4,  d21, d18, d0[2], d0[3]
+        vrshrn.i32      d16, q2,  #12 // t0a
+        vrshrn.i32      d23, q3,  #12 // t1a
+        vmull_vmlsl     q5,  d21, d18, d0[3], d0[2]
+        vmull_vmlal     q6,  d19, d20, d1[0], d1[1]
+        vrshrn.i32      d18, q4,  #12 // t2a
+        vrshrn.i32      d21, q5,  #12 // t3a
+        vmull_vmlsl     q7,  d19, d20, d1[1], d1[0]
+        vmull_vmlal     q2,  d17, d22, d1[2], d1[3]
+        vrshrn.i32      d20, q6,  #12 // t4a
+        vrshrn.i32      d19, q7,  #12 // t5a
+        vmull_vmlsl     q3,  d17, d22, d1[3], d1[2]
+        vrshrn.i32      d22, q2,  #12 // t6a
+        vrshrn.i32      d17, q3,  #12 // t7a
+
+        vqadd.s16       d4,  d16, d20 // t0
+        vqsub.s16       d5,  d16, d20 // t4
+        vqadd.s16       d6,  d23, d19 // t1
+        vqsub.s16       d7,  d23, d19 // t5
+        vqadd.s16       d8,  d18, d22 // t2
+        vqsub.s16       d9,  d18, d22 // t6
+        vqadd.s16       d18, d21, d17 // t3
+        vqsub.s16       d19, d21, d17 // t7
+
+        vmull_vmlal     q8,  d5,  d7,  d2[3], d2[2]
+        vmull_vmlsl     q10, d5,  d7,  d2[2], d2[3]
+        vmull_vmlsl     q11, d19, d9,  d2[3], d2[2]
+
+        vrshrn.i32      d5,  q8,  #12 // t4a
+        vrshrn.i32      d7,  q10, #12 // t5a
+
+        vmull_vmlal     q8,  d19, d9,  d2[2], d2[3]
+
+        vrshrn.i32      d9,  q11, #12 // t6a
+        vrshrn.i32      d19, q8,  #12 // t7a
+
+        vqadd.s16       \r0, d4,  d8  // out0
+        vqsub.s16       d4,  d4,  d8  // t2
+        vqadd.s16       \r7, d6,  d18 // out7
+        vqsub.s16       d6,  d6,  d18 // t3
+        vqneg.s16       \r7, \r7      // out7
+
+        vqadd.s16       \r1, d5,  d9  // out1
+        vqsub.s16       d5,  d5,  d9  // t6
+        vqadd.s16       \r6, d7,  d19 // out6
+        vqsub.s16       d7,  d7,  d19 // t7
+        vqneg.s16       \r1, \r1      // out1
+
+        vmull_vmlal     q9,  d4,  d6,  d2[0], d2[0] // -> out3 (d19 or d20)
+        vmull_vmlsl     q4,  d4,  d6,  d2[0], d2[0] // -> out4 (d20 or d19)
+        vmull_vmlsl     q10, d5,  d7,  d2[0], d2[0] // -> out5 (d21 or d18)
+        vrshrn.i32      d4,  q9,  #12 // out3
+        vmull_vmlal     q9,  d5,  d7,  d2[0], d2[0] // -> out2 (d18 or d21)
+        vrshrn.i32      d5,  q10, #12 // out5
+        vrshrn.i32      \r2, q9,  #12 // out2 (d18 or d21)
+        vrshrn.i32      \r4, q4,  #12 // out4 (d20 or d19)
+
+        vqneg.s16       \r3, d4       // out3
+        vqneg.s16       \r5, d5       // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+        iadst_8h_x8     q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+        iadst_8h_x8     q15, q14, q13, q12, q11, q10, q9,  q8,  d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+        bx              lr
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+        iadst_4h_x8     d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+        iadst_4h_x8     d23, d22, d21, d20, d19, d18, d17, d16
+        bx              lr
+endfunc
+
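+// The 8-point identity transform scales by exactly 2, so a saturating
+// left shift by 1 suffices.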
+function inv_identity_8h_x8_neon, export=1
+        vqshl.s16       q8,  q8,  #1
+        vqshl.s16       q9,  q9,  #1
+        vqshl.s16       q10, q10, #1
+        vqshl.s16       q11, q11, #1
+        vqshl.s16       q12, q12, #1
+        vqshl.s16       q13, q13, #1
+        vqshl.s16       q14, q14, #1
+        vqshl.s16       q15, q15, #1
+        bx              lr
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+        vqshl.s16       q8,  q8,  #1
+        vqshl.s16       q9,  q9,  #1
+        vqshl.s16       q10, q10, #1
+        vqshl.s16       q11, q11, #1
+        bx              lr
+endfunc
+
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vld1.16         {q8,  q9},  [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]!
+        vld1.16         {q10, q11}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]!
+        vld1.16         {q12, q13}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]!
+        vld1.16         {q14, q15}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]
+
+.ifc \variant, identity_
+        // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+        blx             r4
+
+        vrshr.s16       q8,  q8,  #1
+        vrshr.s16       q9,  q9,  #1
+        vrshr.s16       q10, q10, #1
+        vrshr.s16       q11, q11, #1
+        vrshr.s16       q12, q12, #1
+        vrshr.s16       q13, q13, #1
+        vrshr.s16       q14, q14, #1
+        vrshr.s16       q15, q15, #1
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        blx             r5
+
+        load_add_store_8x8 r0, r7
+        vpop            {q4-q7}
+        pop             {r4-r5,r7,pc}
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1
+.endif
+        push            {r4-r5,r7,lr}
+        vpush           {q4-q7}
+        movrel_local    r5,  inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_8x8_neon
+.else
+        movrel_local    r4,  inv_\txfm1\()_8h_x8_neon
+        b               inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        movw            r12, #2896*8
+        vdup.16         d0,  r12
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]!
+        vld1.16         {d20, d21, d22, d23}, [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]
+
+        scale_input     d0[0], q8,  q9, q10, q11
+
+        blx             r4
+
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        vswp            d17, d20
+        vswp            d19, d21
+        vswp            d18, d20
+        vswp            d21, d22
+
+        blx             r5
+
+        load_add_store_8x4 r0, r7
+        vpop            {q4-q7}
+        pop             {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        movw            r12, #2896*8
+        vdup.16         d0,  r12
+        vld1.16         {q8,  q9},  [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]!
+        vld1.16         {q10, q11}, [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]
+
+        scale_input     d0[0], q8,  q9, q10, q11
+
+        blx             r4
+
+        transpose_4x8h  q8,  q9,  q10, q11
+        vswp            d17, d20
+        vswp            d19, d21
+        vswp            d17, d18
+        vswp            d19, d22
+
+        blx             r5
+
+        load_add_store_4x8 r0, r7
+        vpop            {q4-q7}
+        pop             {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0
+.endif
+        push            {r4-r5,r7,lr}
+        vpush           {q4-q7}
+        movrel_local    r4,  inv_\txfm1\()_\h\()h_x\w\()_neon
+        movrel_local    r5,  inv_\txfm2\()_\w\()h_x\h\()_neon
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
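+// 16-point DCT: the even-indexed inputs (d16, d18, ..., d30) go through
+// the 8-point DCT, while the odd-indexed inputs are rotated into
+// t8a..t15a and merged with butterflies.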
+function inv_dct_4h_x16_neon, export=1
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0, q1}, [r12, :128]
+
+        vmull_vmlsl     q2,  d17, d31, d2[0], d2[1]  // -> t8a
+        vmull_vmlal     q3,  d17, d31, d2[1], d2[0]  // -> t15a
+        vmull_vmlsl     q4,  d25, d23, d2[2], d2[3]  // -> t9a
+        vrshrn.i32      d17, q2,  #12                // t8a
+        vrshrn.i32      d31, q3,  #12                // t15a
+        vmull_vmlal     q2,  d25, d23, d2[3], d2[2]  // -> t14a
+        vmull_vmlsl     q3,  d21, d27, d3[0], d3[1]  // -> t10a
+        vrshrn.i32      d23, q4,  #12                // t9a
+        vrshrn.i32      d25, q2,  #12                // t14a
+        vmull_vmlal     q4,  d21, d27, d3[1], d3[0]  // -> t13a
+        vmull_vmlsl     q2,  d29, d19, d3[2], d3[3]  // -> t11a
+        vrshrn.i32      d21, q3,  #12                // t10a
+        vrshrn.i32      d27, q4,  #12                // t13a
+        vmull_vmlal     q3,  d29, d19, d3[3], d3[2]  // -> t12a
+        vrshrn.i32      d19, q2,  #12                // t11a
+        vrshrn.i32      d29, q3,  #12                // t12a
+
+        idct_4h_x8      d16, d18, d20, d22, d24, d26, d28, d30
+
+        vqsub.s16       d4,  d17, d23  // t9
+        vqadd.s16       d17, d17, d23  // t8
+        vqsub.s16       d5,  d31, d25  // t14
+        vqadd.s16       d31, d31, d25  // t15
+        vqsub.s16       d23, d19, d21  // t10
+        vqadd.s16       d19, d19, d21  // t11
+        vqadd.s16       d25, d29, d27  // t12
+        vqsub.s16       d29, d29, d27  // t13
+
+        vmull_vmlsl     q3,  d5,  d4,  d0[2], d0[3]  // -> t9a
+        vmull_vmlal     q4,  d5,  d4,  d0[3], d0[2]  // -> t14a
+        vrshrn.i32      d21, q3,  #12                // t9a
+        vrshrn.i32      d27, q4,  #12                // t14a
+
+        vmull_vmlsl     q3,  d29, d23, d0[2], d0[3]  // -> t13a
+        vmull_vmlal     q4,  d29, d23, d0[3], d0[2]  // -> t10a
+        vrshrn.i32      d29, q3,  #12                // t13a
+        vneg.s32        q4,  q4
+        vrshrn.i32      d23, q4,  #12                // t10a
+
+        vqsub.s16       d4,  d17, d19  // t11a
+        vqadd.s16       d17, d17, d19  // t8a
+        vqsub.s16       d5,  d31, d25  // t12a
+        vqadd.s16       d31, d31, d25  // t15a
+        vqadd.s16       d19, d21, d23  // t9
+        vqsub.s16       d21, d21, d23  // t10
+        vqsub.s16       d25, d27, d29  // t13
+        vqadd.s16       d27, d27, d29  // t14
+
+        vmull_vmlsl     q3,  d5,  d4,  d0[0], d0[0]  // -> t11
+        vmull_vmlal     q4,  d5,  d4,  d0[0], d0[0]  // -> t12
+        vmull_vmlsl     q2,  d25, d21, d0[0], d0[0]  // -> t10a
+
+        vrshrn.i32      d6,  q3,  #12  // t11
+        vrshrn.i32      d7,  q4,  #12  // t12
+        vmull_vmlal     q4,  d25, d21, d0[0], d0[0]  // -> t10a
+        vrshrn.i32      d4,  q2,  #12  // t10a
+        vrshrn.i32      d5,  q4,  #12  // t13a
+
+        vqadd.s16       d8,  d16, d31  // out0
+        vqsub.s16       d31, d16, d31  // out15
+        vmov            d16, d8
+        vqadd.s16       d23, d30, d17  // out7
+        vqsub.s16       d9,  d30, d17  // out8
+        vqadd.s16       d17, d18, d27  // out1
+        vqsub.s16       d30, d18, d27  // out14
+        vqadd.s16       d18, d20, d5   // out2
+        vqsub.s16       d29, d20, d5   // out13
+        vqadd.s16       d5,  d28, d19  // out6
+        vqsub.s16       d25, d28, d19  // out9
+        vqadd.s16       d19, d22, d7   // out3
+        vqsub.s16       d28, d22, d7   // out12
+        vqadd.s16       d20, d24, d6   // out4
+        vqsub.s16       d27, d24, d6   // out11
+        vqadd.s16       d21, d26, d4   // out5
+        vqsub.s16       d26, d26, d4   // out10
+        vmov            d24, d9
+        vmov            d22, d5
+
+        bx              lr
+endfunc
+
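+// 16-point ADST as staged rotations and butterflies; the odd-indexed
+// outputs are negated (vqneg) per the ADST sign convention, and
+// flipadst below is the same macro with the output order reversed.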
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+        movrel_local    r12, iadst16_coeffs
+        vld1.16         {q0, q1}, [r12, :128]
+        movrel_local    r12, idct_coeffs
+
+        vmull_vmlal     q2,  d31, d16, d0[0], d0[1] // -> t0
+        vmull_vmlsl     q3,  d31, d16, d0[1], d0[0] // -> t1
+        vmull_vmlal     q4,  d29, d18, d0[2], d0[3] // -> t2
+        vrshrn.i32      d16, q2,  #12               // t0
+        vrshrn.i32      d31, q3,  #12               // t1
+        vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t3
+        vmull_vmlal     q3,  d27, d20, d1[0], d1[1] // -> t4
+        vrshrn.i32      d18, q4,  #12               // t2
+        vrshrn.i32      d29, q2,  #12               // t3
+        vmull_vmlsl     q4,  d27, d20, d1[1], d1[0] // -> t5
+        vmull_vmlal     q2,  d25, d22, d1[2], d1[3] // -> t6
+        vrshrn.i32      d20, q3,  #12               // t4
+        vrshrn.i32      d27, q4,  #12               // t5
+        vmull_vmlsl     q3,  d25, d22, d1[3], d1[2] // -> t7
+        vmull_vmlal     q4,  d23, d24, d2[0], d2[1] // -> t8
+        vrshrn.i32      d22, q2,  #12               // t6
+        vrshrn.i32      d25, q3,  #12               // t7
+        vmull_vmlsl     q2,  d23, d24, d2[1], d2[0] // -> t9
+        vmull_vmlal     q3,  d21, d26, d2[2], d2[3] // -> t10
+        vrshrn.i32      d23, q4,  #12               // t8
+        vrshrn.i32      d24, q2,  #12               // t9
+        vmull_vmlsl     q4,  d21, d26, d2[3], d2[2] // -> t11
+        vmull_vmlal     q2,  d19, d28, d3[0], d3[1] // -> t12
+        vrshrn.i32      d21, q3,  #12               // t10
+        vrshrn.i32      d26, q4,  #12               // t11
+        vmull_vmlsl     q3,  d19, d28, d3[1], d3[0] // -> t13
+        vmull_vmlal     q4,  d17, d30, d3[2], d3[3] // -> t14
+        vrshrn.i32      d19, q2,  #12               // t12
+        vrshrn.i32      d28, q3,  #12               // t13
+        vmull_vmlsl     q2,  d17, d30, d3[3], d3[2] // -> t15
+        vrshrn.i32      d17, q4,  #12               // t14
+        vrshrn.i32      d30, q2,  #12               // t15
+
+        vld1.16         {q0}, [r12, :128]
+
+        vqsub.s16       d2,  d16, d23 // t8a
+        vqadd.s16       d16, d16, d23 // t0a
+        vqsub.s16       d3,  d31, d24 // t9a
+        vqadd.s16       d31, d31, d24 // t1a
+        vqadd.s16       d23, d18, d21 // t2a
+        vqsub.s16       d18, d18, d21 // t10a
+        vqadd.s16       d24, d29, d26 // t3a
+        vqsub.s16       d29, d29, d26 // t11a
+        vqadd.s16       d21, d20, d19 // t4a
+        vqsub.s16       d20, d20, d19 // t12a
+        vqadd.s16       d26, d27, d28 // t5a
+        vqsub.s16       d27, d27, d28 // t13a
+        vqadd.s16       d19, d22, d17 // t6a
+        vqsub.s16       d22, d22, d17 // t14a
+        vqadd.s16       d28, d25, d30 // t7a
+        vqsub.s16       d25, d25, d30 // t15a
+
+        vmull_vmlal     q2,  d2,  d3,  d1[1], d1[0] // -> t8
+        vmull_vmlsl     q3,  d2,  d3,  d1[0], d1[1] // -> t9
+        vmull_vmlal     q4,  d18, d29, d1[3], d1[2] // -> t10
+        vrshrn.i32      d17, q2,  #12               // t8
+        vrshrn.i32      d30, q3,  #12               // t9
+        vmull_vmlsl     q2,  d18, d29, d1[2], d1[3] // -> t11
+        vmull_vmlsl     q3,  d27, d20, d1[1], d1[0] // -> t12
+        vrshrn.i32      d18, q4,  #12               // t10
+        vrshrn.i32      d29, q2,  #12               // t11
+        vmull_vmlal     q4,  d27, d20, d1[0], d1[1] // -> t13
+        vmull_vmlsl     q2,  d25, d22, d1[3], d1[2] // -> t14
+        vrshrn.i32      d27, q3,  #12               // t12
+        vrshrn.i32      d20, q4,  #12               // t13
+        vmull_vmlal     q3,  d25, d22, d1[2], d1[3] // -> t15
+        vrshrn.i32      d25, q2,  #12               // t14
+        vrshrn.i32      d22, q3,  #12               // t15
+
+        vqsub.s16       d2,  d16, d21 // t4
+        vqadd.s16       d16, d16, d21 // t0
+        vqsub.s16       d3,  d31, d26 // t5
+        vqadd.s16       d31, d31, d26 // t1
+        vqadd.s16       d21, d23, d19 // t2
+        vqsub.s16       d23, d23, d19 // t6
+        vqadd.s16       d26, d24, d28 // t3
+        vqsub.s16       d24, d24, d28 // t7
+        vqadd.s16       d19, d17, d27 // t8a
+        vqsub.s16       d17, d17, d27 // t12a
+        vqadd.s16       d28, d30, d20 // t9a
+        vqsub.s16       d30, d30, d20 // t13a
+        vqadd.s16       d27, d18, d25 // t10a
+        vqsub.s16       d18, d18, d25 // t14a
+        vqadd.s16       d20, d29, d22 // t11a
+        vqsub.s16       d29, d29, d22 // t15a
+
+        vmull_vmlal     q2,  d2,  d3,  d0[3], d0[2] // -> t4a
+        vmull_vmlsl     q3,  d2,  d3,  d0[2], d0[3] // -> t5a
+        vmull_vmlsl     q4,  d24, d23, d0[3], d0[2] // -> t6a
+        vrshrn.i32      d22, q2,  #12               // t4a
+        vrshrn.i32      d25, q3,  #12               // t5a
+        vmull_vmlal     q2,  d24, d23, d0[2], d0[3] // -> t7a
+        vmull_vmlal     q3,  d17, d30, d0[3], d0[2] // -> t12
+        vrshrn.i32      d24, q4,  #12               // t6a
+        vrshrn.i32      d23, q2,  #12               // t7a
+        vmull_vmlsl     q4,  d17, d30, d0[2], d0[3] // -> t13
+        vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t14
+        vrshrn.i32      d17, q3,  #12               // t12
+        vmull_vmlal     q3,  d29, d18, d0[2], d0[3] // -> t15
+        vrshrn.i32      d29, q4,  #12               // t13
+        vrshrn.i32      d30, q2,  #12               // t14
+        vrshrn.i32      d18, q3,  #12               // t15
+
+        vqsub.s16       d2,  d16, d21 // t2a
+.ifc \o0, d16
+        vqadd.s16       \o0, d16, d21 // out0
+        vqsub.s16       d21, d31, d26 // t3a
+        vqadd.s16       \o15,d31, d26 // out15
+.else
+        vqadd.s16       d4,  d16, d21 // out0
+        vqsub.s16       d21, d31, d26 // t3a
+        vqadd.s16       \o15,d31, d26 // out15
+        vmov            \o0, d4
+.endif
+        vqneg.s16       \o15, \o15    // out15
+
+        vqsub.s16       d3,  d29, d18 // t15a
+        vqadd.s16       \o13,d29, d18 // out13
+        vqadd.s16       \o2, d17, d30 // out2
+        vqsub.s16       d26, d17, d30 // t14a
+        vqneg.s16       \o13,\o13     // out13
+
+        vqadd.s16       \o1, d19, d27 // out1
+        vqsub.s16       d27, d19, d27 // t10
+        vqadd.s16       \o14,d28, d20 // out14
+        vqsub.s16       d20, d28, d20 // t11
+        vqneg.s16       \o1, \o1      // out1
+
+        vqadd.s16       \o3, d22, d24 // out3
+        vqsub.s16       d22, d22, d24 // t6
+        vqadd.s16       \o12,d25, d23 // out12
+        vqsub.s16       d23, d25, d23 // t7
+        vqneg.s16       \o3, \o3      // out3
+
+        vmull_vmlsl     q12, d2,  d21, d0[0], d0[0] // -> out8 (d24 or d23)
+        vmull_vmlal     q2,  d2,  d21, d0[0], d0[0] // -> out7 (d23 or d24)
+        vmull_vmlal     q3,  d26, d3,  d0[0], d0[0] // -> out5 (d21 or d26)
+
+        vrshrn.i32      d24, q12, #12 // out8
+        vrshrn.i32      d4,  q2,  #12 // out7
+        vrshrn.i32      d5,  q3,  #12 // out5
+        vmull_vmlsl     q4,  d26, d3,  d0[0], d0[0] // -> out10 (d26 or d21)
+        vmull_vmlal     q1,  d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+        vrshrn.i32      d26, q4,  #12 // out10
+
+        vmull_vmlsl     q4,  d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+        vmull_vmlal     q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+        vmull_vmlsl     q3,  d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+        vrshrn.i32      \o4, q1,  #12 // out4
+        vrshrn.i32      d7,  q3,  #12 // out9
+        vrshrn.i32      d6,  q4,  #12 // out11
+        vrshrn.i32      \o6, q11, #12 // out6
+
+.ifc \o8, d23
+        vmov            \o8, d24
+        vmov            \o10,d26
+.endif
+
+        vqneg.s16       \o7, d4  // out7
+        vqneg.s16       \o5, d5  // out5
+        vqneg.s16       \o11,d6  // out11
+        vqneg.s16       \o9, d7  // out9
+.endm
+
+function inv_adst_4h_x16_neon, export=1
+        iadst_16        d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+        iadst_16        d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+        bx              lr
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+        movw            r12, #2*(5793-4096)*8
+        vdup.16         d0,  r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q1,  \i,  d0[0]
+        vqadd.s16       \i,  \i,  \i
+        vqadd.s16       \i,  \i,  q1
+.endr
+        bx              lr
+endfunc
+
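+// Identity scaling for the 16-point transform (2*sqrt(2)) with the
+// inter-pass downshift folded in: shift2 yields (x*2*sqrt(2)) >> 2 and
+// shift1 yields (x*2*sqrt(2)) >> 1, keeping intermediates within int16.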
+.macro identity_4x16_shift2 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c
+        vshr.s16        q2,  q2,  #1
+        vrhadd.s16      \i,  \i,  q2
+.endr
+.endm
+
+.macro identity_4x16_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c
+        vrshr.s16       q2,  q2,  #1
+        vqadd.s16       \i,  \i,  q2
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+        identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c
+        vqadd.s16       \i,  \i,  \i
+        vqadd.s16       \i,  \i,  q2
+.endr
+.endm
+
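+// First-pass helper for 16-column slices: \scale pre-multiplies the
+// input by 2896/4096 ~= 1/sqrt(2) (rectangular transform scaling),
+// \identity inlines the identity transform instead of calling r4, and
+// \shift is the inter-pass downshift (negative values are folded into
+// the identity multiply above).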
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+        push            {lr}
+        vmov.i16        d7,  #0
+.if \identity
+        movw            r12, #2*(5793-4096)*8
+        vdup.16         d0,  r12
+.endif
+.if \scale
+        movw            r12, #2896*8
+        vdup.16         d1,  r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64]
+        vst1.16         {d7}, [r7, :64], r8
+.endr
+.if \scale
+        scale_input     d1[0], q8,  q9, q10, q11, q12, q13, q14, q15
+.endif
+.if \identity
+.if \shift == -2
+        identity_4x16_shift2 d0[0]
+.else
+        identity_4x16_shift1 d0[0]
+.endif
+.else
+        blx             r4
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \i,  \i,  #\shift
+.endr
+.endif
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        transpose_4x4h  q14, q15, d28, d29, d30, d31
+
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+        vst1.16         {\i}, [r6, :64]!
+.endr
+
+        pop             {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+
+function inv_txfm_add_vert_4x16_neon
+        push            {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8
+.endr
+        blx             r5
+        load_add_store_4x16 r6, r7
+        pop             {pc}
+endfunc
+
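+// Reserve \space bytes of 16-byte-aligned stack; the alignment delta is
+// stored on the stack itself so no register is tied up as a frame
+// pointer. The /4096 and %4096 split keeps each sub/add immediate
+// encodable as an ARM modified immediate for the sizes used here.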
+.macro sub_sp_align space
+#if CONFIG_THUMB
+        mov             r7,  sp
+        and             r7,  r7,  #15
+#else
+        and             r7,  sp,  #15
+#endif
+        sub             sp,  sp,  r7
+        // Now the stack is aligned, store the amount of adjustment back
+        // on the stack, as we don't want to waste a register as frame
+        // pointer.
+        str             r7,  [sp, #-16]!
+#ifdef _WIN32
+.if \space > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer.
+        .error          "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+        sub             r7,  sp,  #4096
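+        // Probe the page at sp-4096 so the Windows guard page is
+        // committed before sp moves past it.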
+        ldr             r12, [r7]
+        sub             r7,  r7,  #(\space - 4096)
+        mov             sp,  r7
+.else
+        sub             sp,  sp,  #\space
+.endif
+#else
+.if \space >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space
+.if \space >= 4096
+        add             sp,  sp,  #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+        add             sp,  sp,  #(\space)%4096
+.endif
+        ldr             r7,  [sp], #16
+        // Add back the original stack adjustment
+        add             sp,  sp,  r7
+.endm
+
+function inv_txfm_add_16x16_neon
+        sub_sp_align    512
+        ldrh            r11, [r10], #2
+.irp i, 0, 4, 8, 12
+        add             r6,  sp,  #(\i*16*2)
+.if \i > 0
+        mov             r8,  #(16 - \i)
+        cmp             r3,  r11
+        blt             1f
+.if \i < 12
+        ldrh            r11, [r10], #2
+.endif
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #16*2
+        blx             r9
+.endr
+        b               3f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+3:
+.irp i, 0, 4, 8, 12
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #32
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    512
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
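+// Eob thresholds consumed by inv_txfm_add_16x16_neon: once the eob (r3)
+// falls below the next entry, the remaining first-pass slices contain
+// no coefficients and are zero-filled instead of transformed; identity
+// transforms use the separate table below.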
+const eob_16x16
+        .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+        .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2
+.endif
+        push            {r4-r11,lr}
+        vpush           {q4}
+.ifc \txfm1, identity
+        movrel_local    r9,  inv_txfm_horz_identity_16x4_neon
+.else
+        movrel_local    r9,  inv_txfm_horz_16x4_neon
+        movrel_local    r4,  inv_\txfm1\()_4h_x16_neon
+.endif
+        movrel_local    r5,  inv_\txfm2\()_4h_x16_neon
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel_local    r10, eob_16x16
+.else
+        movrel_local    r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+        movrel_local    r10, eob_16x16_identity
+.else
+        movrel_local    r10, eob_16x16
+.endif
+.endif
+        b               inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+
+.ifc \variant, identity_
+        vmov.i16        d4,  #0
+.irp i, d16, d18, d20, d22
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+        movw            r12, #2*(5793-4096)*8
+        vdup.16         d0,  r12
+.irp i, d24, d26, d28, d30
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+
+        identity_4x16_shift1 d0[0]
+.else
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d20, d21, d22, d23}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+
+        blx             r4
+
+        vswp            d17, d20
+        vswp            d19, d22
+        vswp            d18, d20
+        vswp            d19, d21
+.irp i, q8, q9, q10, q11
+        vrshr.s16       \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        blx             r5
+        mov             r6,  r0
+        load_add_store_8x4 r6, r7
+
+.ifc \variant, identity_
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q10, q14
+        vmov            q11, q15
+.else
+        vswp            d25, d28
+        vswp            d27, d30
+        vswp            d26, d28
+        vswp            d27, d29
+        vrshr.s16       q8,  q12, #1
+        vrshr.s16       q9,  q13, #1
+        vrshr.s16       q10, q14, #1
+        vrshr.s16       q11, q15, #1
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        blx             r5
+        add             r6,  r0,  #8
+        load_add_store_8x4 r6, r7
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+        vmov.i16        q2,  #0
+
+        mov             r11, #32
+        cmp             r3,  r10
+        blt             1f
+
+        add             r6,  r2,  #16
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+        vld1.16         {\i}, [r6, :128]
+        vst1.16         {q2}, [r6, :128], r11
+.endr
+        movw            r12, #(5793-4096)*8
+        vdup.16         d0,  r12
+        identity_8x4_shift1 q12, q13, q14, q15, d0[0]
+.else
+.irp i, q8,  q9,  q10, q11
+        vld1.16         {\i}, [r6, :128]
+        vst1.16         {q2}, [r6, :128], r11
+.endr
+        blx             r4
+        vrshr.s16       q12, q8,  #1
+        vrshr.s16       q13, q9,  #1
+        vrshr.s16       q14, q10, #1
+        vrshr.s16       q15, q11, #1
+.endif
+        transpose_4x8h  q12, q13, q14, q15
+        vswp            d27, d29
+        vswp            d26, d28
+        vswp            d27, d30
+        vswp            d25, d28
+
+        b               2f
+1:
+.irp i, q12, q13, q14, q15
+        vmov.i16        \i,  #0
+.endr
+2:
+        vmov.i16        q2,  #0
+.irp i, q8,  q9,  q10, q11
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+        movw            r12, #(5793-4096)*8
+        vdup.16         d0,  r12
+        identity_8x4_shift1 q8,  q9,  q10, q11, d0[0]
+.else
+        blx             r4
+.irp i, q8, q9, q10, q11
+        vrshr.s16       \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        vswp            d19, d21
+        vswp            d18, d20
+        vswp            d19, d22
+        vswp            d17, d20
+
+        blx             r5
+
+        load_add_store_4x16 r0, r6
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+.if \w == 4
+        movrel_local    r4,  inv_\txfm1\()_8h_x\w\()_neon
+        movrel_local    r5,  inv_\txfm2\()_4h_x\h\()_neon
+        mov             r10, #\eob_half
+.else
+        movrel_local    r4,  inv_\txfm1\()_4h_x\w\()_neon
+        movrel_local    r5,  inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+        sub_sp_align    256
+
+.irp i, 0, 4
+        add             r6,  sp,  #(\i*16*2)
+.if \i > 0
+        cmp             r3,  r10
+        blt             1f
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #8*2
+        blx             r9
+.endr
+        b               2f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 8
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\j}, [r7, :128], r8
+.endr
+        blx             r5
+
+        add             r6,  r0,  #(\i)
+        load_add_store_8x8 r6, r7
+.endr
+
+        add_sp_align    256
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+        sub_sp_align    256
+
+.irp i, 0, 8
+        add             r6,  sp,  #(\i*8*2)
+.if \i > 0
+        cmp             r3,  r10
+        blt             1f
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #16*2
+
+        vmov.i16        q2,  #0
+        movw            r12, #2896*8
+        vdup.16         d0,  r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\j}, [r7, :128]
+        vst1.16         {q2}, [r7, :128], r8
+.endr
+        scale_input     d0[0], q8,  q9,  q10, q11, q12, q13, q14, q15
+.ifc \variant, identity_
+        // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+        blx             r4
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \j,  \j,  #1
+.endr
+.endif
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+        vst1.16         {q8,  q9},  [r6, :128]!
+        vst1.16         {q10, q11}, [r6, :128]!
+        vst1.16         {q12, q13}, [r6, :128]!
+        vst1.16         {q14, q15}, [r6, :128]!
+.endr
+        b               2f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 4
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #16
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    256
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+.if \w == 8
+        movrel_local    r4,  inv_\txfm1\()_8h_x8_neon
+        movrel_local    r5,  inv_\txfm2\()_4h_x16_neon
+.else
+.ifc \txfm1, identity
+        movrel_local    r9,  inv_txfm_horz_scale_identity_16x4_neon
+.else
+        movrel_local    r4,  inv_\txfm1\()_4h_x16_neon
+        movrel_local    r9,  inv_txfm_horz_scale_16x4_neon
+.endif
+        movrel_local    r5,  inv_\txfm2\()_8h_x8_neon
+.endif
+.if \w == 8
+        mov             r10, #\eob_8x8
+.else
+        mov             r10, #\eob_4x4
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
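+// Odd half of the 32-point DCT: consumes the 16 odd-indexed inputs in
+// d16-d31 and produces out16..out31. The coefficient pointer starts
+// 2*16 bytes into idct_coeffs, at the dct32-specific constants.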
+function inv_dct32_odd_4h_x16_neon, export=1
+        movrel_local    r12, idct_coeffs, 2*16
+        vld1.16         {q0, q1}, [r12, :128]
+        sub             r12, r12, #2*16
+
+        vmull_vmlsl     q2,  d16, d31, d0[0], d0[1] // -> t16a
+        vmull_vmlal     q3,  d16, d31, d0[1], d0[0] // -> t31a
+        vmull_vmlsl     q4,  d24, d23, d0[2], d0[3] // -> t17a
+        vrshrn.i32      d16, q2,  #12               // t16a
+        vrshrn.i32      d31, q3,  #12               // t31a
+        vmull_vmlal     q2,  d24, d23, d0[3], d0[2] // -> t30a
+        vmull_vmlsl     q3,  d20, d27, d1[0], d1[1] // -> t18a
+        vrshrn.i32      d24, q4,  #12               // t17a
+        vrshrn.i32      d23, q2,  #12               // t30a
+        vmull_vmlal     q4,  d20, d27, d1[1], d1[0] // -> t29a
+        vmull_vmlsl     q2,  d28, d19, d1[2], d1[3] // -> t19a
+        vrshrn.i32      d20, q3,  #12               // t18a
+        vrshrn.i32      d27, q4,  #12               // t29a
+        vmull_vmlal     q3,  d28, d19, d1[3], d1[2] // -> t28a
+        vmull_vmlsl     q4,  d18, d29, d2[0], d2[1] // -> t20a
+        vrshrn.i32      d28, q2,  #12               // t19a
+        vrshrn.i32      d19, q3,  #12               // t28a
+        vmull_vmlal     q2,  d18, d29, d2[1], d2[0] // -> t27a
+        vmull_vmlsl     q3,  d26, d21, d2[2], d2[3] // -> t21a
+        vrshrn.i32      d18, q4,  #12               // t20a
+        vrshrn.i32      d29, q2,  #12               // t27a
+        vmull_vmlal     q4,  d26, d21, d2[3], d2[2] // -> t26a
+        vmull_vmlsl     q2,  d22, d25, d3[0], d3[1] // -> t22a
+        vrshrn.i32      d26, q3,  #12               // t21a
+        vrshrn.i32      d21, q4,  #12               // t26a
+        vmull_vmlal     q3,  d22, d25, d3[1], d3[0] // -> t25a
+        vmull_vmlsl     q4,  d30, d17, d3[2], d3[3] // -> t23a
+        vrshrn.i32      d22, q2,  #12               // t22a
+        vrshrn.i32      d25, q3,  #12               // t25a
+        vmull_vmlal     q2,  d30, d17, d3[3], d3[2] // -> t24a
+        vrshrn.i32      d30, q4,  #12               // t23a
+        vrshrn.i32      d17, q2,  #12               // t24a
+
+        vld1.16         {q0}, [r12, :128]
+
+        vqsub.s16       d2,  d16, d24 // t17
+        vqadd.s16       d16, d16, d24 // t16
+        vqsub.s16       d3,  d31, d23 // t30
+        vqadd.s16       d31, d31, d23 // t31
+        vqsub.s16       d24, d28, d20 // t18
+        vqadd.s16       d28, d28, d20 // t19
+        vqadd.s16       d23, d18, d26 // t20
+        vqsub.s16       d18, d18, d26 // t21
+        vqsub.s16       d20, d30, d22 // t22
+        vqadd.s16       d30, d30, d22 // t23
+        vqadd.s16       d26, d17, d25 // t24
+        vqsub.s16       d17, d17, d25 // t25
+        vqsub.s16       d22, d29, d21 // t26
+        vqadd.s16       d29, d29, d21 // t27
+        vqadd.s16       d25, d19, d27 // t28
+        vqsub.s16       d19, d19, d27 // t29
+
+        vmull_vmlsl     q2,  d3,  d2,  d1[0], d1[1] // -> t17a
+        vmull_vmlal     q3,  d3,  d2,  d1[1], d1[0] // -> t30a
+        vmull_vmlal     q4,  d19, d24, d1[1], d1[0] // -> t18a
+        vrshrn.i32      d21, q2,  #12               // t17a
+        vrshrn.i32      d27, q3,  #12               // t30a
+        vneg.s32        q4,  q4                     // -> t18a
+        vmull_vmlsl     q1,  d19, d24, d1[0], d1[1] // -> t29a
+        vmull_vmlsl     q2,  d22, d18, d1[2], d1[3] // -> t21a
+        vrshrn.i32      d19, q4,  #12               // t18a
+        vrshrn.i32      d24, q1,  #12               // t29a
+        vmull_vmlal     q3,  d22, d18, d1[3], d1[2] // -> t26a
+        vmull_vmlal     q4,  d17, d20, d1[3], d1[2] // -> t22a
+        vrshrn.i32      d22, q2,  #12               // t21a
+        vrshrn.i32      d18, q3,  #12               // t26a
+        vneg.s32        q4,  q4                     // -> t22a
+        vmull_vmlsl     q1,  d17, d20, d1[2], d1[3] // -> t25a
+        vrshrn.i32      d17, q4,  #12               // t22a
+        vrshrn.i32      d20, q1,  #12               // t25a
+
+        vqsub.s16       d2,  d27, d24 // t29
+        vqadd.s16       d27, d27, d24 // t30
+        vqsub.s16       d3,  d21, d19 // t18
+        vqadd.s16       d21, d21, d19 // t17
+        vqsub.s16       d24, d16, d28 // t19a
+        vqadd.s16       d16, d16, d28 // t16a
+        vqsub.s16       d19, d30, d23 // t20a
+        vqadd.s16       d30, d30, d23 // t23a
+        vqsub.s16       d28, d17, d22 // t21
+        vqadd.s16       d17, d17, d22 // t22
+        vqadd.s16       d23, d26, d29 // t24a
+        vqsub.s16       d26, d26, d29 // t27a
+        vqadd.s16       d22, d20, d18 // t25
+        vqsub.s16       d20, d20, d18 // t26
+        vqsub.s16       d29, d31, d25 // t28a
+        vqadd.s16       d31, d31, d25 // t31a
+
+        vmull_vmlsl     q2,  d2,  d3,  d0[2], d0[3] // -> t18a
+        vmull_vmlal     q3,  d2,  d3,  d0[3], d0[2] // -> t29a
+        vmull_vmlsl     q4,  d29, d24, d0[2], d0[3] // -> t19
+        vrshrn.i32      d18, q2,  #12               // t18a
+        vrshrn.i32      d25, q3,  #12               // t29a
+        vmull_vmlal     q1,  d29, d24, d0[3], d0[2] // -> t28
+        vmull_vmlal     q2,  d26, d19, d0[3], d0[2] // -> t20
+        vrshrn.i32      d29, q4,  #12               // t19
+        vrshrn.i32      d24, q1,  #12               // t28
+        vneg.s32        q2,  q2                     // -> t20
+        vmull_vmlsl     q3,  d26, d19, d0[2], d0[3] // -> t27
+        vmull_vmlal     q4,  d20, d28, d0[3], d0[2] // -> t21a
+        vrshrn.i32      d26, q2,  #12               // t20
+        vrshrn.i32      d19, q3,  #12               // t27
+        vneg.s32        q4,  q4                     // -> t21a
+        vmull_vmlsl     q1,  d20, d28, d0[2], d0[3] // -> t26a
+        vrshrn.i32      d20, q4,  #12               // t21a
+        vrshrn.i32      d28, q1,  #12               // t26a
+
+        vqsub.s16       d2,  d16, d30 // t23
+        vqadd.s16       d16, d16, d30 // t16 = out16
+        vqsub.s16       d3,  d31, d23 // t24
+        vqadd.s16       d31, d31, d23 // t31 = out31
+        vqsub.s16       d23, d21, d17 // t22a
+        vqadd.s16       d17, d21, d17 // t17a = out17
+        vqadd.s16       d30, d27, d22 // t30a = out30
+        vqsub.s16       d21, d27, d22 // t25a
+        vqsub.s16       d27, d18, d20 // t21
+        vqadd.s16       d18, d18, d20 // t18 = out18
+        vqadd.s16       d4,  d29, d26 // t19a = out19
+        vqsub.s16       d26, d29, d26 // t20a
+        vqadd.s16       d29, d25, d28 // t29 = out29
+        vqsub.s16       d25, d25, d28 // t26
+        vqadd.s16       d28, d24, d19 // t28a = out28
+        vqsub.s16       d24, d24, d19 // t27a
+        vmov            d19, d4       // out19
+
+        vmull_vmlsl     q2,  d24, d26, d0[0], d0[0] // -> t20
+        vmull_vmlal     q3,  d24, d26, d0[0], d0[0] // -> t27
+        vrshrn.i32      d20, q2,  #12   // t20
+        vrshrn.i32      d22, q3,  #12   // t27
+
+        vmull_vmlal     q2,  d25, d27, d0[0], d0[0] // -> t26a
+        vmull_vmlsl     q3,  d25, d27, d0[0], d0[0] // -> t21a
+        vmov            d27, d22        // t27
+        vrshrn.i32      d26, q2,  #12   // t26a
+
+        vmull_vmlsl     q12, d21, d23, d0[0], d0[0] // -> t22
+        vmull_vmlal     q2,  d21, d23, d0[0], d0[0] // -> t25
+        vrshrn.i32      d21, q3,  #12   // t21a
+        vrshrn.i32      d22, q12, #12   // t22
+        vrshrn.i32      d25, q2,  #12   // t25
+
+        vmull_vmlsl     q2,  d3,  d2,  d0[0], d0[0] // -> t23a
+        vmull_vmlal     q3,  d3,  d2,  d0[0], d0[0] // -> t24a
+        vrshrn.i32      d23, q2,  #12   // t23a
+        vrshrn.i32      d24, q3,  #12   // t24a
+
+        bx              lr
+endfunc
+
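+// Horizontal pass of the 32-point DCT over a 4-sample slice. The load
+// stride is doubled so that the even-indexed inputs (handled by the
+// 16-point DCT) and the odd-indexed inputs (handled by
+// inv_dct32_odd_4h_x16_neon) are fetched as two interleaved passes;
+// store2 then butterfly-combines the two halves, storing the sums
+// forward and the differences mirrored via vrev64, with the rounding
+// shift given by the macro parameter.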
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+        push            {lr}
+        vmov.i16        d7,  #0
+        lsl             r8,  r8,  #1
+.if \scale
+        movw            r12, #2896*8
+        vdup.16         d0,  r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64]
+        vst1.16         {d7}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4
+        add             r7,  r7,  r8, lsr #1
+.if \scale
+        scale_input     d0[0], q8,  q9,  q10, q11, q12, q13, q14, q15
+.endif
+        bl              inv_dct_4h_x16_neon
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        transpose_4x4h  q14, q15, d28, d29, d30, d31
+
+.macro store1 r0, r1, r2, r3
+        vst1.16         {\r0}, [r6, :64]!
+        vst1.16         {\r1}, [r6, :64]!
+        vst1.16         {\r2}, [r6, :64]!
+        vst1.16         {\r3}, [r6, :64]!
+        add             r6,  r6,  #32
+.endm
+        store1          d16, d20, d24, d28
+        store1          d17, d21, d25, d29
+        store1          d18, d22, d26, d30
+        store1          d19, d23, d27, d31
+.purgem store1
+        sub             r6,  r6,  #64*4
+
+        vmov.i16        d7,  #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64]
+        vst1.16         {d7}, [r7, :64], r8
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the right coeff in d0[1]
+        scale_input     d0[1], q8,  q9,  q10, q11, q12, q13, q14, q15
+.endif
+        bl              inv_dct32_odd_4h_x16_neon
+        transpose_4x4h  q15, q14, d31, d30, d29, d28
+        transpose_4x4h  q13, q12, d27, d26, d25, d24
+        transpose_4x4h  q11, q10, d23, d22, d21, d20
+        transpose_4x4h  q9,  q8,  d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+        vld1.16         {q0, q1}, [r6, :128]
+        vqsub.s16       d7,  d0,  \r0
+        vqadd.s16       d0,  d0,  \r0
+        vqsub.s16       d6,  d1,  \r1
+        vqadd.s16       d1,  d1,  \r1
+        vqsub.s16       d5,  d2,  \r2
+        vqadd.s16       d2,  d2,  \r2
+        vqsub.s16       d4,  d3,  \r3
+        vqadd.s16       d3,  d3,  \r3
+        vrev64.16       q2,  q2
+        vrev64.16       q3,  q3
+        vrshr.s16       q0,  q0,  #\shift
+        vrshr.s16       q1,  q1,  #\shift
+        vrshr.s16       q2,  q2,  #\shift
+        vrshr.s16       q3,  q3,  #\shift
+        vst1.16         {q0, q1}, [r6, :128]!
+        vst1.16         {q2, q3}, [r6, :128]!
+.endm
+
+        store2          d31, d27, d23, d19, \shift
+        store2          d30, d26, d22, d18, \shift
+        store2          d29, d25, d21, d17, \shift
+        store2          d28, d24, d20, d16, \shift
+.purgem store2
+        pop             {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
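+// Vertical pass of the 32-point DCT over a 4-pixel-wide strip. The
+// even half (16-point DCT) is written back to the temp buffer and the
+// odd half kept in registers; the combine macro then produces
+// even + odd for the first 16 rows and even - odd (reading the temp
+// buffer backwards via r9 = -r8) for the last 16, rounding by 4 bits
+// and accumulating into the destination.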
+function inv_txfm_add_vert_dct_4x32_neon
+        push            {r10-r11,lr}
+        lsl             r8,  r8,  #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4
+
+        bl              inv_dct_4h_x16_neon
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vst1.16         {\i}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4
+        add             r7,  r7,  r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4
+        sub             r7,  r7,  r8, lsr #1
+        bl              inv_dct32_odd_4h_x16_neon
+
+        neg             r9,  r8
+        mov             r10, r6
+.macro combine r0, r1, r2, r3, op, stride
+        vld1.16         {d4},    [r7,  :64], \stride
+        vld1.32         {d2[0]}, [r10, :32], r1
+        vld1.16         {d5},    [r7,  :64],  \stride
+        vld1.32         {d2[1]}, [r10, :32], r1
+        \op\().s16      d4,  d4,  \r0
+        vld1.16         {d6},    [r7,  :64], \stride
+        vld1.32         {d3[0]}, [r10, :32], r1
+        \op\().s16      d5,  d5,  \r1
+        vld1.32         {d3[1]}, [r10, :32], r1
+        vrshr.s16       q2,  q2,  #4
+        \op\().s16      d6,  d6,  \r2
+        vld1.16         {d7},    [r7,  :64], \stride
+        vaddw.u8        q2,  q2,  d2
+        \op\().s16      d7,  d7,  \r3
+        vqmovun.s16     d2,  q2
+        vrshr.s16       q3,  q3,  #4
+        vst1.32         {d2[0]}, [r6,  :32], r1
+        vaddw.u8        q3,  q3,  d3
+        vst1.32         {d2[1]}, [r6,  :32], r1
+        vqmovun.s16     d3,  q3
+        vst1.32         {d3[0]}, [r6,  :32], r1
+        vst1.32         {d3[1]}, [r6,  :32], r1
+.endm
+        combine         d31, d30, d29, d28, vqadd, r8
+        combine         d27, d26, d25, d24, vqadd, r8
+        combine         d23, d22, d21, d20, vqadd, r8
+        combine         d19, d18, d17, d16, vqadd, r8
+        sub             r7,  r7,  r8
+        combine         d16, d17, d18, d19, vqsub, r9
+        combine         d20, d21, d22, d23, vqsub, r9
+        combine         d24, d25, d26, d27, vqsub, r9
+        combine         d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+        pop             {r10-r11,pc}
+endfunc
+
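+// End-of-block thresholds: entry n is the smallest eob for which input
+// slice n of the first pass still needs to be computed. The transform
+// functions compare the block's eob (r3) against these and zero-fill
+// the remaining slices once the threshold is no longer reached.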
+const eob_32x32
+        .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+        .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+        .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+        // Contrary to the others, this one is only ever used in increments of 8x8
+        .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+        push            {r4-r7,lr}
+        vmov.i16        q0,  #0
+        movrel_local    r5,  eob_32x32, 2
+
+        mov             r6,  #2*32
+1:
+        mov             r12, #0
+        movrel_local    r4,  eob_32x32, 2
+2:
+        add             r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r6
+.endr
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        load_add_store_8x8 r0, r7, shiftbits=2
+        ldrh            lr,  [r4], #4
+        sub             r0,  r0,  r1, lsl #3
+        cmp             r3,  lr
+        add             r0,  r0,  #8
+        bge             2b
+
+        ldrh            lr,  [r5], #4
+        cmp             r3,  lr
+        blt             9f
+
+        sub             r0,  r0,  r12
+        add             r0,  r0,  r1, lsl #3
+        mls             r2,  r6,  r12, r2
+        add             r2,  r2,  #2*8
+        b               1b
+9:
+        pop             {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        \op             \i,  \i,  #\shift
+.endr
+.endm
+
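+// Identity/identity transforms for 16x32 and 32x16: both passes are
+// plain scalings, so each 8x8 tile is just multiplied by the
+// 1/sqrt(2) rectangular prescale (2896/4096) and the identity scale
+// factor derived from 5793/4096 (~sqrt(2)) held in d0, transposed and
+// added to the destination; the eob tables still allow skipping
+// all-zero tiles.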
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        push            {r4-r7,lr}
+        movw            r6,  #2896*8
+        movw            r7,  #2*(5793-4096)*8
+        vdup.i16        d0,  r6
+        movrel_local    r5,  eob_16x32\hshort, 2
+        vmov.16         d0[1], r7
+
+        mov             r6,  #2*\h
+1:
+        mov             r12, #0
+        movrel_local    r4,  eob_16x32\wshort, 2
+2:
+        vmov.i16        q1,  #0
+        add             r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q1}, [r2, :128], r6
+.endr
+        scale_input     d0[0], q8,  q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+        // 16x32
+        identity_8x8_shift1 d0[1]
+.else
+        // 32x16
+        shift_8_regs    vqshl.s16, 1
+        identity_8x8    d0[1]
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+        load_add_store_8x8 r0, r7, shiftbits=2
+.else
+        load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+        ldrh            lr,  [r4], #4
+        sub             r0,  r0,  r1, lsl #3
+        cmp             r3,  lr
+        add             r0,  r0,  #8
+        bge             2b
+
+        ldrh            lr,  [r5], #4
+        cmp             r3,  lr
+        blt             9f
+
+        sub             r0,  r0,  r12
+        add             r0,  r0,  r1, lsl #3
+        mls             r2,  r6,  r12, r2
+        add             r2,  r2,  #2*8
+        b               1b
+9:
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        push            {r4-r5,lr}
+        vmov.i16        q0,  #0
+        movrel_local    r4,  eob_8x32
+
+        mov             r12, #2*\h
+1:
+        ldrh            lr,  [r4], #2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+        // 8x32
+        shift_8_regs    vrshr.s16, 1
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        cmp             r3,  lr
+.if \w == 8
+        load_add_store_8x8 r0, r5, shiftbits=2
+.else
+        load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+        blt             9f
+.if \w == 8
+        sub             r2,  r2,  r12, lsl #3
+        add             r2,  r2,  #2*8
+.else
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  #8
+.endif
+        b               1b
+
+9:
+        pop             {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
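+// Full 32x32 inverse DCT: the first pass runs in 4-sample slices into
+// a 32x32 coefficient buffer on the stack, zero-filling slices past
+// the eob; the second pass transforms 4-pixel-wide strips of that
+// buffer and adds them to the destination.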
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+        idct_dc         32,  32,  2
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    2048
+        movrel_local    r10, eob_32x32
+        ldrh            r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  sp,  #(\i*32*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+.if \i < 28
+        ldrh            r11, [r10], #2
+.endif
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    2048
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+        idct_dc         16,  32,  1
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    1024
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2
+        movrel_local    r4,  inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  sp,  #(\i*16*2)
+        add             r7,  r2,  #(\i*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+.if \i < 28
+        ldrh            r11, [r10], #2
+.endif
+.endif
+        mov             r8,  #2*32
+        bl              inv_txfm_horz_scale_16x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #16*2
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    1024
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+        idct_dc         32,  16,  1
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    1024
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2
+        movrel_local    r5,  inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12
+        add             r6,  sp,  #(\i*32*2)
+        add             r7,  r2,  #(\i*2)
+.if \i > 0
+        mov             r8,  #(16 - \i)
+        cmp             r3,  r11
+        blt             1f
+.if \i < 12
+        ldrh            r11, [r10], #2
+.endif
+.endif
+        mov             r8,  #2*16
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    1024
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+        idct_dc         8,   32,  2
+
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        sub_sp_align    512
+
+        movrel_local    r10, eob_8x32
+
+        mov             r8,  #2*32
+        mov             r9,  #32
+        mov             r6,  sp
+1:
+        vmov.i16        q0,  #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r8
+.endr
+        ldrh            r11, [r10], #2
+        sub             r2,  r2,  r8, lsl #3
+        sub             r9,  r9,  #8
+        add             r2,  r2,  #2*8
+
+        bl              inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \i,  \i,  #2
+.endr
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        vst1.16         {q8,  q9},  [r6, :128]!
+        cmp             r3,  r11
+        vst1.16         {q10, q11}, [r6, :128]!
+        vst1.16         {q12, q13}, [r6, :128]!
+        vst1.16         {q14, q15}, [r6, :128]!
+
+        bge             1b
+        cmp             r9,  #0
+        beq             3f
+
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r9,  r9,  #8
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4
+        add             r6,  r0,  #(\i)
+        add             r7,  sp,  #(\i*2)
+        mov             r8,  #8*2
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    512
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+        idct_dc         32,  8,   2
+
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        sub_sp_align    512
+
+.irp i, 0, 4
+        add             r6,  sp,  #(\i*32*2)
+        add             r7,  r2,  #(\i*2)
+.if \i > 0
+        cmp             r3,  #10
+        blt             1f
+.endif
+        mov             r8,  #8*2
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               2f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+
+2:
+        mov             r8,  #2*32
+        mov             r9,  #0
+1:
+        add             r6,  r0,  r9
+        add             r7,  sp,  r9, lsl #1 // equivalent to #(\i*2) in the unrolled variants
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r7, :128], r8
+.endr
+        add             r9,  r9,  #8
+
+        bl              inv_dct_8h_x8_neon
+
+        cmp             r9,  #32
+
+        load_add_store_8x8 r6, r7
+
+        blt             1b
+
+        add_sp_align    512
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
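+// First stage of the odd half of the 64-point DCT: each call consumes
+// four input coefficients (multiplied by per-lane factors loaded from
+// the coefficient table at r12) and produces eight intermediate
+// t-values, appended to the scratch buffer at r6. It is invoked four
+// times per transform, once per input group listed below.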
+function inv_dct64_step1_neon
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+        vld1.16         {d0, d1, d2}, [r12, :64]!
+
+        vqrdmulh.s16    d23, d16, d0[1]  // t63a
+        vqrdmulh.s16    d16, d16, d0[0]  // t32a
+        vqrdmulh.s16    d22, d17, d0[2]  // t62a
+        vqrdmulh.s16    d17, d17, d0[3]  // t33a
+        vqrdmulh.s16    d21, d18, d1[1]  // t61a
+        vqrdmulh.s16    d18, d18, d1[0]  // t34a
+        vqrdmulh.s16    d20, d19, d1[2]  // t60a
+        vqrdmulh.s16    d19, d19, d1[3]  // t35a
+
+        vqadd.s16       d24, d16, d17    // t32
+        vqsub.s16       d25, d16, d17    // t33
+        vqsub.s16       d26, d19, d18    // t34
+        vqadd.s16       d27, d19, d18    // t35
+        vqadd.s16       d28, d20, d21    // t60
+        vqsub.s16       d29, d20, d21    // t61
+        vqsub.s16       d30, d23, d22    // t62
+        vqadd.s16       d31, d23, d22    // t63
+
+        vmull_vmlal     q2,  d29, d26, d2[0], d2[1] // -> t34a
+        vmull_vmlsl     q3,  d29, d26, d2[1], d2[0] // -> t61a
+        vneg.s32        q2,  q2                     // t34a
+        vmull_vmlsl     q4,  d30, d25, d2[1], d2[0] // -> t33a
+        vrshrn.i32      d26, q2,  #12               // t34a
+        vmull_vmlal     q2,  d30, d25, d2[0], d2[1] // -> t62a
+        vrshrn.i32      d29, q3,  #12               // t61a
+        vrshrn.i32      d25, q4,  #12               // t33a
+        vrshrn.i32      d30, q2,  #12               // t62a
+
+        vqadd.s16       d16, d24, d27    // t32a
+        vqsub.s16       d19, d24, d27    // t35a
+        vqadd.s16       d17, d25, d26    // t33
+        vqsub.s16       d18, d25, d26    // t34
+        vqsub.s16       d20, d31, d28    // t60a
+        vqadd.s16       d23, d31, d28    // t63a
+        vqsub.s16       d21, d30, d29    // t61
+        vqadd.s16       d22, d30, d29    // t62
+
+        vmull_vmlal     q2,  d21, d18, d2[2], d2[3] // -> t61a
+        vmull_vmlsl     q3,  d21, d18, d2[3], d2[2] // -> t34a
+        vmull_vmlal     q4,  d20, d19, d2[2], d2[3] // -> t60
+        vrshrn.i32      d21, q2,  #12               // t61a
+        vrshrn.i32      d18, q3,  #12               // t34a
+        vmull_vmlsl     q2,  d20, d19, d2[3], d2[2] // -> t35
+        vrshrn.i32      d20, q4,  #12               // t60
+        vrshrn.i32      d19, q2,  #12               // t35
+
+        vst1.16         {d16, d17, d18, d19}, [r6, :128]!
+        vst1.16         {d20, d21, d22, d23}, [r6, :128]!
+
+        bx              lr
+endfunc
+
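+// Second stage of the odd half: re-reads the 32 t-values written by
+// step1 (r6 walking up, r9 walking down) and applies the remaining
+// butterflies and rotations in place, four values at a time.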
+function inv_dct64_step2_neon
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64]
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        vldr            d16, [r6, #2*4*0]  // t32a
+        vldr            d17, [r9, #2*4*8]  // t39a
+        vldr            d18, [r9, #2*4*0]  // t63a
+        vldr            d19, [r6, #2*4*8]  // t56a
+        vldr            d20, [r6, #2*4*16] // t40a
+        vldr            d21, [r9, #2*4*24] // t47a
+        vldr            d22, [r9, #2*4*16] // t55a
+        vldr            d23, [r6, #2*4*24] // t48a
+
+        vqadd.s16       d24, d16, d17      // t32
+        vqsub.s16       d25, d16, d17      // t39
+        vqadd.s16       d26, d18, d19      // t63
+        vqsub.s16       d27, d18, d19      // t56
+        vqsub.s16       d28, d21, d20      // t40
+        vqadd.s16       d29, d21, d20      // t47
+        vqadd.s16       d30, d23, d22      // t48
+        vqsub.s16       d31, d23, d22      // t55
+
+        vmull_vmlal     q2,  d27, d25, d0[3], d0[2] // -> t56a
+        vmull_vmlsl     q3,  d27, d25, d0[2], d0[3] // -> t39a
+        vmull_vmlal     q4,  d31, d28, d0[3], d0[2] // -> t40a
+        vrshrn.i32      d25, q2,  #12               // t56a
+        vrshrn.i32      d27, q3,  #12               // t39a
+        vneg.s32        q4,  q4                     // t40a
+        vmull_vmlsl     q2,  d31, d28, d0[2], d0[3] // -> t55a
+        vrshrn.i32      d31, q4,  #12               // t40a
+        vrshrn.i32      d28, q2,  #12               // t55a
+
+        vqadd.s16       d16, d24, d29      // t32a
+        vqsub.s16       d19, d24, d29      // t47a
+        vqadd.s16       d17, d27, d31      // t39
+        vqsub.s16       d18, d27, d31      // t40
+        vqsub.s16       d20, d26, d30      // t48a
+        vqadd.s16       d23, d26, d30      // t63a
+        vqsub.s16       d21, d25, d28      // t55
+        vqadd.s16       d22, d25, d28      // t56
+
+        vmull_vmlsl     q2,  d21, d18, d0[0], d0[0] // -> t40a
+        vmull_vmlal     q3,  d21, d18, d0[0], d0[0] // -> t55a
+        vmull_vmlsl     q4,  d20, d19, d0[0], d0[0] // -> t47
+        vrshrn.i32      d18, q2,  #12               // t40a
+        vrshrn.i32      d21, q3,  #12               // t55a
+        vmull_vmlal     q2,  d20, d19, d0[0], d0[0] // -> t48
+        vrshrn.i32      d19, q4,  #12               // t47
+        vrshrn.i32      d20, q2,  #12               // t48
+
+        vstr            d16, [r6, #2*4*0]  // t32a
+        vstr            d17, [r9, #2*4*0]  // t39
+        vstr            d18, [r6, #2*4*8]  // t40a
+        vstr            d19, [r9, #2*4*8]  // t47
+        vstr            d20, [r6, #2*4*16] // t48
+        vstr            d21, [r9, #2*4*16] // t55a
+        vstr            d22, [r6, #2*4*24] // t56
+        vstr            d23, [r9, #2*4*24] // t63a
+
+        add             r6,  r6,  #2*4
+        sub             r9,  r9,  #2*4
+        cmp             r6,  r9
+        blt             1b
+        bx              lr
+endfunc
+
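+// Helpers for def_dct64_func below: load8/store16/clear_upper8 move
+// coefficient batches, and the *_if macros expand their instruction
+// only when the condition is set, so one function body can generate
+// the plain, _clear (inputs zeroed as they are read) and _clear_scale
+// (additional input prescale) variants.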
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+        vld1.16         {\i}, [\src, :64]
+        vst1.16         {\zero}, [\src, :64], \strd
+.else
+        vld1.16         {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+        vst1.16         {q8,  q9},  [\dst, :128]!
+        vst1.16         {q10, q11}, [\dst, :128]!
+        vst1.16         {q12, q13}, [\dst, :128]!
+        vst1.16         {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+        vmov.i16        \i,  #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+        vmov.i16        \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+        movw            \gpr, \val
+        vdup.16         \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+        vst1.16         \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+        scale_input     \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
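+// The 64-point DCT is assembled from smaller transforms: inputs
+// 0,4,...,28 go through the 16-point DCT, inputs 2,6,...,30 through
+// the 32-point odd half (with the upper inputs zeroed, as AV1 only
+// codes the first 32 coefficients of a 64-sample dimension), and the
+// odd inputs 1,3,...,31 through the step1/step2 helpers above.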
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1
+        mov             r6,  sp
+
+        push            {r10-r11,lr}
+
+        lsl             r8,  r8,  #2
+
+        movdup_if       d0,  r12, #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        load8           r7,  r8,  d7,  \clear
+        clear_upper8
+        sub             r7,  r7,  r8, lsl #3
+        add             r7,  r7,  r8, lsr #1
+        scale_if        \scale, d0[0], q8, q9, q10, q11
+
+        bl              inv_dct_4h_x16_neon
+
+        store16         r6
+
+        movdup_if       d0,  r12, #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        load8           r7,  r8,  d7,  \clear
+        clear_upper8
+        sub             r7,  r7,  r8, lsl #3
+        lsr             r8,  r8,  #1
+        sub             r7,  r7,  r8, lsr #1
+        scale_if        \scale, d0[0], q8, q9, q10, q11
+
+        bl              inv_dct32_odd_4h_x16_neon
+
+        add             r10, r6,  #8*15
+        sub             r6,  r6,  #8*16
+
+        mov             r9,  #-8
+
+.macro store_addsub r0, r1, r2, r3
+        vld1.16         {d2},  [r6, :64]!
+        vld1.16         {d3},  [r6, :64]!
+        vqadd.s16       d6,  d2,  \r0
+        vqsub.s16       \r0, d2,  \r0
+        vld1.16         {d4},  [r6, :64]!
+        vqadd.s16       d7,  d3,  \r1
+        vqsub.s16       \r1, d3,  \r1
+        vld1.16         {d5},  [r6, :64]!
+        vqadd.s16       d2,  d4,  \r2
+        sub             r6,  r6,  #8*4
+        vqsub.s16       \r2, d4,  \r2
+        vst1.16         {d6},  [r6,  :64]!
+        vst1.16         {\r0}, [r10, :64], r9
+        vqadd.s16       d3,  d5,  \r3
+        vqsub.s16       \r3, d5,  \r3
+        vst1.16         {d7},  [r6,  :64]!
+        vst1.16         {\r1}, [r10, :64], r9
+        vst1.16         {d2},  [r6,  :64]!
+        vst1.16         {\r2}, [r10, :64], r9
+        vst1.16         {d3},  [r6,  :64]!
+        vst1.16         {\r3}, [r10, :64], r9
+.endm
+        store_addsub    d31, d30, d29, d28
+        store_addsub    d27, d26, d25, d24
+        store_addsub    d23, d22, d21, d20
+        store_addsub    d19, d18, d17, d16
+.purgem store_addsub
+
+        add             r6,  r6,  #2*4*16
+
+        movrel_local    r12, idct64_coeffs
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        add             r9,  r7,  r8, lsl #4 // offset 16
+        add             r10, r7,  r8, lsl #3 // offset 8
+        sub             r9,  r9,  r8         // offset 15
+        sub             r11, r10, r8         // offset 7
+        vld1.16         {d16}, [r7,  :64]    // in1  (offset 0)
+        vld1.16         {d17}, [r9,  :64]    // in31 (offset 15)
+        vld1.16         {d18}, [r10, :64]    // in17 (offset 8)
+        vld1.16         {d19}, [r11, :64]    // in15 (offset 7)
+        vst1_if         {d7},  [r7,  :64], \clear
+        vst1_if         {d7},  [r9,  :64], \clear
+        vst1_if         {d7},  [r10, :64], \clear
+        vst1_if         {d7},  [r11, :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        add             r7,  r7,  r8, lsl #2 // offset 4
+        sub             r9,  r9,  r8, lsl #2 // offset 11
+        sub             r10, r7,  r8         // offset 3
+        add             r11, r9,  r8         // offset 12
+        vld1.16         {d16}, [r10, :64]    // in7  (offset 3)
+        vld1.16         {d17}, [r11, :64]    // in25 (offset 12)
+        vld1.16         {d18}, [r9,  :64]    // in23 (offset 11)
+        vld1.16         {d19}, [r7,  :64]    // in9  (offset 4)
+        vst1_if         {d7},  [r7,  :64], \clear
+        vst1_if         {d7},  [r9,  :64], \clear
+        vst1_if         {d7},  [r10, :64], \clear
+        vst1_if         {d7},  [r11, :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        sub             r10, r10, r8, lsl #1 // offset 1
+        sub             r9,  r9,  r8, lsl #1 // offset 9
+        add             r10, r10, r8         // offset 2
+        add             r9,  r9,  r8         // offset 10
+        add             r7,  r7,  r8         // offset 5
+        add             r11, r11, r8         // offset 13
+        vld1.16         d16, [r10, :64]      // in5  (offset 2)
+        vld1.16         d17, [r11, :64]      // in27 (offset 13)
+        vld1.16         d18, [r9,  :64]      // in21 (offset 10)
+        vld1.16         d19, [r7,  :64]      // in11 (offset 5)
+        vst1_if         d7,  [r10, :64], \clear
+        vst1_if         d7,  [r11, :64], \clear
+        vst1_if         d7,  [r9,  :64], \clear
+        vst1_if         d7,  [r7,  :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        sub             r10, r10, r8         // offset 1
+        sub             r9,  r9,  r8         // offset 9
+        add             r11, r11, r8         // offset 14
+        add             r7,  r7,  r8         // offset 6
+        vld1.16         d16, [r10, :64]      // in3  (offset 1)
+        vld1.16         d17, [r11, :64]      // in29 (offset 14)
+        vld1.16         d18, [r9,  :64]      // in19 (offset 9)
+        vld1.16         d19, [r7,  :64]      // in13 (offset 6)
+        vst1_if         d7,  [r10, :64], \clear
+        vst1_if         d7,  [r11, :64], \clear
+        vst1_if         d7,  [r9,  :64], \clear
+        vst1_if         d7,  [r7,  :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+
+        sub             r6,  r6,  #2*4*32
+        add             r9,  r6,  #2*4*7
+
+        bl              inv_dct64_step2_neon
+
+        pop             {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
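+// First-pass finish for the 64-point sizes: reads the 64 transformed
+// values of each 4-sample slice from the stack (first half ascending
+// via r7, second half descending via r8), transposes and
+// butterfly-combines them, writing outputs 0-31 forward from r6 and
+// 63-32 mirrored from r9. The (negative) shift amount is passed in r9
+// on entry and applied through vrshl.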
+function inv_txfm_horz_dct_64x4_neon
+        vdup.16         q3,  r9
+
+        mov             r7,  sp
+        add             r8,  sp,  #2*4*(64 - 4)
+        add             r9,  r6,  #2*56
+
+        push            {r10-r11,lr}
+
+        mov             r10, #2*64
+        mov             r11, #-2*4*4
+
+1:
+        vld1.16         {d16, d17, d18, d19}, [r7, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r8, :128], r11
+        vld1.16         {d20, d21, d22, d23}, [r7, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r8, :128], r11
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q15, q14, d31, d30, d29, d28
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q13, q12, d27, d26, d25, d24
+
+.macro store_addsub src0, src1, src2, src3
+        vqsub.s16       d3,  \src0,  \src1
+        vqsub.s16       d2,  \src2,  \src3
+        vqadd.s16       d0,  \src0,  \src1
+        vqadd.s16       d1,  \src2,  \src3
+        vrshl.s16       q1,  q1,  q3
+        vrshl.s16       q0,  q0,  q3
+        vrev64.16       q1,  q1
+        vst1.16         {q0},  [r6, :128], r10
+        vst1.16         {q1},  [r9, :128], r10
+.endm
+        store_addsub    d16, d31, d20, d27
+        store_addsub    d17, d30, d21, d26
+        store_addsub    d18, d29, d22, d25
+        store_addsub    d19, d28, d23, d24
+.purgem store_addsub
+        sub             r6,  r6,  r10, lsl #2
+        sub             r9,  r9,  r10, lsl #2
+        add             r6,  r6,  #16
+        sub             r9,  r9,  #16
+
+        cmp             r7,  r8
+        blt             1b
+        pop             {r10-r11,pc}
+endfunc
+
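+// Second-pass finish for the 64-point sizes: the same mirrored
+// butterfly as the horizontal version, but with a fixed 4-bit
+// rounding shift and the results accumulated into the destination
+// pixels, r6 walking down from row 0 and r9 up from row 63.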
+function inv_txfm_add_vert_dct_4x64_neon
+        lsl             r8,  r8,  #1
+
+        mov             r7,  sp
+        add             r8,  sp,  #2*4*(64 - 4)
+        add             r9,  r6,  r1, lsl #6
+        sub             r9,  r9,  r1
+
+        push            {r10-r11,lr}
+
+        neg             r10, r1
+        mov             r11, #-2*4*4
+
+1:
+        vld1.16         {d16, d17, d18, d19}, [r7, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r8, :128], r11
+        vld1.16         {d20, d21, d22, d23}, [r7, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r8, :128], r11
+
+.macro add_dest_addsub src0, src1, src2, src3
+        vld1.32         {d0[0]}, [r6, :32], r1
+        vld1.32         {d1[0]}, [r9, :32], r10
+        vqadd.s16       d4,  \src0,  \src1
+        vld1.32         {d0[1]}, [r6, :32]
+        vqadd.s16       d5,  \src2,  \src3
+        vld1.32         {d1[1]}, [r9, :32]
+        vqsub.s16       d6,  \src0,  \src1
+        vqsub.s16       d7,  \src2,  \src3
+        sub             r6,  r6,  r1
+        sub             r9,  r9,  r10
+        vrshr.s16       q2,  q2,  #4
+        vrshr.s16       q3,  q3,  #4
+        vaddw.u8        q2,  q2,  d0
+        vaddw.u8        q3,  q3,  d1
+        vqmovun.s16     d0,  q2
+        vqmovun.s16     d1,  q3
+        vst1.32         {d0[0]}, [r6, :32], r1
+        vst1.32         {d1[0]}, [r9, :32], r10
+        vst1.32         {d0[1]}, [r6, :32], r1
+        vst1.32         {d1[1]}, [r9, :32], r10
+.endm
+        add_dest_addsub d16, d31, d17, d30
+        add_dest_addsub d18, d29, d19, d28
+        add_dest_addsub d20, d27, d21, d26
+        add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+        cmp             r7,  r8
+        blt             1b
+
+        pop             {r10-r11,pc}
+endfunc
+
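+// The 64-sided sizes allocate the whole intermediate buffer plus a
+// 64x4 scratch strip on the stack: the first pass runs the
+// appropriate dct_4h_x64 variant and the 64x4 horizontal finish per
+// slice, skipping slices past the eob, and the second pass adds
+// 4-pixel-wide strips to the destination.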
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+        idct_dc         64,  64,  2
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+
+        sub_sp_align    64*32*2+64*4*2
+        add             r5,  sp,  #64*4*2
+
+        movrel_local    r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r5,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_clear_4h_x64_neon
+        add             r6,  r5,  #(\i*64*2)
+        mov             r9,  #-2 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            r11, [r10], #2
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    64*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+        idct_dc         64,  32,  1
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+
+        sub_sp_align    64*32*2+64*4*2
+        add             r5,  sp,  #64*4*2
+
+        movrel_local    r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r5,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_clear_scale_4h_x64_neon
+        add             r6,  r5,  #(\i*64*2)
+        mov             r9,  #-1 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            r11, [r10], #2
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+        add             r6,  r0,  #(\i)
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    64*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+        idct_dc         32,  64,  1
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+
+        sub_sp_align    32*32*2+64*4*2
+        add             r5,  sp,  #64*4*2
+
+        movrel_local    r10, eob_32x32
+        ldrh            r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r5,  #(\i*32*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+        ldrh            r11, [r10], #2
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    32*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+        idct_dc         64,  16,  2
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+
+        sub_sp_align    64*16*2+64*4*2
+        add             r4,  sp,  #64*4*2
+
+        movrel_local    r10, eob_16x32
+
+.irp i, 0, 4, 8, 12
+        add             r6,  r4,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(16 - \i)
+        cmp             r3,  r11
+        blt             1f
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #16*2
+        bl              inv_txfm_dct_clear_4h_x64_neon
+        add             r6,  r4,  #(\i*64*2)
+        mov             r9,  #-2 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 8
+        ldrh            r11, [r10], #2
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+        movrel_local    r5,  inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+        add             r6,  r0,  #(\i)
+        add             r7,  r4,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    64*16*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+        idct_dc         16,  64,  2
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+
+        sub_sp_align    16*32*2+64*4*2
+        add             r5,  sp,  #64*4*2
+
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2
+
+        movrel_local    r4,  inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r5,  #(\i*16*2)
+.if \i > 0
+        mov             r8,  #(32 - \i)
+        cmp             r3,  r11
+        blt             1f
+        ldrh            r11, [r10], #2
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #16*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    16*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
diff --git a/src/arm/32/loopfilter.S b/src/arm/32/loopfilter.S
new file mode 100644
index 0000000..25f993d
--- /dev/null
+++ b/src/arm/32/loopfilter.S
@@ -0,0 +1,868 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
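+// Generates the loop filter cores for filter widths 4, 6, 8 and 16,
+// operating on 8 pixels at a time. On entry d10/d11/d12 hold the
+// E/I/H thresholds and d13-d15 the per-lane masks for wd >= 4,
+// wd > 4 and wd == 16; the code derives the fm, flat8in and flat8out
+// masks, applies the narrow 4-tap filter and, where the flat masks
+// are set, the wider smoothing filters. r12, r9 and r8 hold the
+// return addresses for the no-write, inner-4-pixel and inner-6-pixel
+// epilogues.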
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+        vabd.u8         d0,  d22, d23 // abs(p1 - p0)
+        vabd.u8         d1,  d25, d24 // abs(q1 - q0)
+        vabd.u8         d2,  d23, d24 // abs(p0 - q0)
+        vabd.u8         d3,  d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+        vabd.u8         d4,  d21, d22 // abs(p2 - p1)
+        vabd.u8         d5,  d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+        vabd.u8         d6,  d20, d21 // abs(p3 - p2)
+        vabd.u8         d7,  d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+        vmax.u8         d4,  d4,  d5
+.endif
+        vqadd.u8        d2,  d2,  d2  // abs(p0 - q0) * 2
+.if \wd >= 8
+        vmax.u8         d6,  d6,  d7
+.endif
+        vshr.u8         d3,  d3,  #1
+.if \wd >= 8
+        vmax.u8         d4,  d4,  d6
+.endif
+.if \wd >= 6
+        vand            d4,  d4,  d14
+.endif
+        vmax.u8         d0,  d0,  d1  // max(abs(p1 - p0), abs(q1 - q0))
+        vqadd.u8        d2,  d2,  d3  // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+        vmax.u8         d4,  d0,  d4
+        vcge.u8         d1,  d11, d4  // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+        vcge.u8         d1,  d11, d0  // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+        vcge.u8         d2,  d10, d2  // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+        vand            d1,  d1,  d2  // fm
+        vand            d1,  d1,  d13 // fm && wd >= 4
+.if \wd >= 6
+        vand            d14, d14, d1  // fm && wd > 4
+.endif
+.if \wd >= 16
+        vand            d15, d15, d1  // fm && wd == 16
+.endif
+
+        vmov            r10, r11, d1
+        orrs            r10, r10, r11
+        beq             9f            // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+        vmov.i8         d10, #1
+        vabd.u8         d2,  d21, d23 // abs(p2 - p0)
+        vabd.u8         d3,  d22, d23 // abs(p1 - p0)
+        vabd.u8         d4,  d25, d24 // abs(q1 - q0)
+        vabd.u8         d5,  d26, d24 // abs(q2 - q0)
+.if \wd >= 8
+        vabd.u8         d6,  d20, d23 // abs(p3 - p0)
+        vabd.u8         d7,  d27, d24 // abs(q3 - q0)
+.endif
+        vmax.u8         d2,  d2,  d3
+        vmax.u8         d4,  d4,  d5
+.if \wd >= 8
+        vmax.u8         d6,  d6,  d7
+.endif
+        vmax.u8         d2,  d2,  d4
+.if \wd >= 8
+        vmax.u8         d2,  d2,  d6
+.endif
+
+.if \wd == 16
+        vabd.u8         d3,  d17, d23 // abs(p6 - p0)
+        vabd.u8         d4,  d18, d23 // abs(p5 - p0)
+        vabd.u8         d5,  d19, d23 // abs(p4 - p0)
+.endif
+        vcge.u8         d2,  d10, d2  // flat8in
+.if \wd == 16
+        vabd.u8         d6,  d28, d24 // abs(q4 - q0)
+        vabd.u8         d7,  d29, d24 // abs(q5 - q0)
+        vabd.u8         d8,  d30, d24 // abs(q6 - q0)
+.endif
+        vand            d14, d2,  d14 // flat8in && fm && wd > 4
+        vbic            d1,  d1,  d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+        vmax.u8         d3,  d3,  d4
+        vmax.u8         d5,  d5,  d6
+.endif
+        vmov            r10, r11, d1
+.if \wd == 16
+        vmax.u8         d7,  d7,  d8
+        vmax.u8         d3,  d3,  d5
+        vmax.u8         d3,  d3,  d7
+        vcge.u8         d3,  d10, d3  // flat8out
+.endif
+        orrs            r10, r10, r11
+.if \wd == 16
+        vand            d15, d15, d3  // flat8out && fm && wd == 16
+        vand            d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+        vbic            d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+        beq             1f            // skip wd == 4 case
+.endif
+
+        vsubl.u8        q1,  d22, d25 // p1 - q1
+        vcgt.u8         d0,  d0,  d12 // hev
+        vqmovn.s16      d2,  q1
+        vand            d4,  d2,  d0  // if (hev) iclip_diff(p1 - q1)
+        vbic            d0,  d1,  d0  // (fm && wd >= 4 && !hev)
+        vsubl.u8        q1,  d24, d23
+        vmov.i16        q3,  #3
+        vmul.i16        q1,  q1,  q3
+        vmov.i8         d6,  #4
+        vaddw.s8        q1,  q1,  d4
+        vmov.i8         d7,  #3
+        vqmovn.s16      d2,  q1       // f
+        vqadd.s8        d4,  d6,  d2  // imin(f + 4, 127)
+        vqadd.s8        d5,  d7,  d2  // imin(f + 3, 127)
+        vshr.s8         d4,  d4,  #3  // f1
+        vshr.s8         d5,  d5,  #3  // f2
+        vmovl.u8        q1,  d23      // p0
+        vmovl.u8        q3,  d24      // q0
+        vaddw.s8        q1,  q1,  d5
+        vsubw.s8        q3,  q3,  d4
+        vrshr.s8        d4,  d4,  #1  // (f1 + 1) >> 1
+        vqmovun.s16     d2,  q1       // out p0
+        vqmovun.s16     d6,  q3       // out q0
+        vbit            d23, d2,  d1  // if (fm && wd >= 4)
+        vmovl.u8        q1,  d22      // p1
+        vbit            d24, d6,  d1  // if (fm && wd >= 4)
+        vmovl.u8        q3,  d25      // q1
+        vaddw.s8        q1,  q1,  d4
+        vsubw.s8        q3,  q3,  d4
+        vqmovun.s16     d2,  q1       // out p1
+        vqmovun.s16     d6,  q3       // out q1
+        vbit            d22, d2,  d0  // if (fm && wd >= 4 && !hev)
+        vbit            d25, d6,  d0  // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+        beq             2f            // skip if there's no flat8in
+
+        vaddl.u8        q0,  d21, d21 // p2 * 2
+        vaddl.u8        q1,  d21, d22 // p2 + p1
+        vaddl.u8        q2,  d22, d23 // p1 + p0
+        vaddl.u8        q3,  d23, d24 // p0 + q0
+        vadd.i16        q4,  q0,  q1
+        vadd.i16        q5,  q2,  q3
+        vaddl.u8        q6,  d24, d25 // q0 + q1
+        vadd.i16        q4,  q4,  q5
+        vsub.i16        q6,  q6,  q0
+        vaddl.u8        q5,  d25, d26 // q1 + q2
+        vrshrn.i16      d0,  q4,  #3  // out p1
+
+        vadd.i16        q4,  q4,  q6
+        vsub.i16        q5,  q5,  q1
+        vaddl.u8        q6,  d26, d26 // q2 + q2
+        vrshrn.i16      d1,  q4,  #3  // out p0
+
+        vadd.i16        q4,  q4,  q5
+        vsub.i16        q6,  q6,  q2
+        vrshrn.i16      d2,  q4,  #3  // out q0
+
+        vbit            d22, d0,  d14 // p1 if (flat8in)
+        vadd.i16        q4,  q4,  q6
+        vbit            d23, d1,  d14 // p0 if (flat8in)
+        vrshrn.i16      d3,  q4,  #3  // out q1
+        vbit            d24, d2,  d14 // q0 if (flat8in)
+        vbit            d25, d3,  d14 // q1 if (flat8in)
+.elseif \wd >= 8
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+.if \wd == 8
+        beq             8f            // skip if there's no flat8in
+.else
+        beq             2f            // skip if there's no flat8in
+.endif
+
+        vaddl.u8        q0,  d20, d21 // p3 + p2
+        vaddl.u8        q1,  d22, d25 // p1 + q1
+        vaddl.u8        q2,  d20, d22 // p3 + p1
+        vaddl.u8        q3,  d23, d26 // p0 + q2
+        vadd.i16        q4,  q0,  q0  // 2 * (p3 + p2)
+        vaddw.u8        q4,  q4,  d23 // + p0
+        vaddw.u8        q4,  q4,  d24 // + q0
+        vadd.i16        q4,  q4,  q2  // + p3 + p1
+        vsub.i16        q1,  q1,  q0  // p1 + q1 - p3 - p2
+        vsub.i16        q3,  q3,  q2  // p0 + q2 - p3 - p1
+        vrshrn.i16      d10, q4,  #3  // out p2
+
+        vadd.i16        q4,  q4,  q1
+        vaddl.u8        q0,  d20, d23 // p3 + p0
+        vaddl.u8        q1,  d24, d27 // q0 + q3
+        vrshrn.i16      d11, q4,  #3  // out p1
+
+        vadd.i16        q4,  q4,  q3
+        vsub.i16        q1,  q1,  q0  // q0 + q3 - p3 - p0
+        vaddl.u8        q2,  d21, d24 // p2 + q0
+        vaddl.u8        q3,  d25, d27 // q1 + q3
+        vrshrn.i16      d12, q4,  #3  // out p0
+
+        vadd.i16        q4,  q4,  q1
+        vsub.i16        q3,  q3,  q2  // q1 + q3 - p2 - q0
+        vaddl.u8        q0,  d22, d25 // p1 + q1
+        vaddl.u8        q1,  d26, d27 // q2 + q3
+        vrshrn.i16      d13, q4,  #3  // out q0
+
+        vadd.i16        q4,  q4,  q3
+        vsub.i16        q1,  q1,  q0  // q2 + q3 - p1 - q1
+        vrshrn.i16      d0,  q4,  #3  // out q1
+
+        vadd.i16        q4,  q4,  q1
+
+        vbit            d21, d10, d14
+        vbit            d22, d11, d14
+        vbit            d23, d12, d14
+        vrshrn.i16      d1,  q4,  #3  // out q2
+        vbit            d24, d13, d14
+        vbit            d25, d0,  d14
+        vbit            d26, d1,  d14
+.endif
+2:
+.if \wd == 16
+        vmov            r10, r11, d15
+        orrs            r10, r10, r11
+        bne             1f            // check if flat8out is needed
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+        beq             8f            // if there was no flat8in, just write the inner 4 pixels
+        b               7f            // if flat8in was used, write the inner 6 pixels
+1:
+
+        vaddl.u8        q1,  d17, d17 // p6 + p6
+        vaddl.u8        q2,  d17, d18 // p6 + p5
+        vaddl.u8        q3,  d17, d19 // p6 + p4
+        vaddl.u8        q4,  d17, d20 // p6 + p3
+        vadd.i16        q6,  q1,  q2
+        vadd.i16        q5,  q3,  q4
+        vaddl.u8        q3,  d17, d21 // p6 + p2
+        vadd.i16        q6,  q6,  q5
+        vaddl.u8        q4,  d17, d22 // p6 + p1
+        vaddl.u8        q5,  d18, d23 // p5 + p0
+        vadd.i16        q3,  q3,  q4
+        vaddl.u8        q4,  d19, d24 // p4 + q0
+        vadd.i16        q6,  q6,  q3
+        vadd.i16        q5,  q5,  q4
+        vaddl.u8        q3,  d20, d25 // p3 + q1
+        vadd.i16        q6,  q6,  q5
+        vsub.i16        q3,  q3,  q1
+        vaddl.u8        q1,  d21, d26 // p2 + q2
+        vrshrn.i16      d0,  q6,  #4  // out p5
+        vadd.i16        q6,  q6,  q3  // - (p6 + p6) + (p3 + q1)
+        vsub.i16        q1,  q1,  q2
+        vaddl.u8        q2,  d22, d27 // p1 + q3
+        vaddl.u8        q3,  d17, d19 // p6 + p4
+        vrshrn.i16      d1,  q6,  #4  // out p4
+        vadd.i16        q6,  q6,  q1  // - (p6 + p5) + (p2 + q2)
+        vsub.i16        q2,  q2,  q3
+        vaddl.u8        q3,  d23, d28 // p0 + q4
+        vaddl.u8        q4,  d17, d20 // p6 + p3
+        vrshrn.i16      d2,  q6,  #4  // out p3
+        vadd.i16        q6,  q6,  q2  // - (p6 + p4) + (p1 + q3)
+        vsub.i16        q3,  q3,  q4
+        vaddl.u8        q4,  d24, d29 // q0 + q5
+        vaddl.u8        q2,  d17, d21 // p6 + p2
+        vrshrn.i16      d3,  q6,  #4  // out p2
+        vadd.i16        q6,  q6,  q3  // - (p6 + p3) + (p0 + q4)
+        vsub.i16        q4,  q4,  q2
+        vaddl.u8        q3,  d25, d30 // q1 + q6
+        vaddl.u8        q5,  d17, d22 // p6 + p1
+        vrshrn.i16      d4,  q6,  #4  // out p1
+        vadd.i16        q6,  q6,  q4  // - (p6 + p2) + (q0 + q5)
+        vsub.i16        q3,  q3,  q5
+        vaddl.u8        q4,  d26, d30 // q2 + q6
+        vbif            d0,  d18, d15 // out p5
+        vaddl.u8        q5,  d18, d23 // p5 + p0
+        vrshrn.i16      d5,  q6,  #4  // out p0
+        vadd.i16        q6,  q6,  q3  // - (p6 + p1) + (q1 + q6)
+        vsub.i16        q4,  q4,  q5
+        vaddl.u8        q5,  d27, d30 // q3 + q6
+        vbif            d1,  d19, d15 // out p4
+        vaddl.u8        q9,  d19, d24 // p4 + q0
+        vrshrn.i16      d6,  q6,  #4  // out q0
+        vadd.i16        q6,  q6,  q4  // - (p5 + p0) + (q2 + q6)
+        vsub.i16        q5,  q5,  q9
+        vaddl.u8        q4,  d28, d30 // q4 + q6
+        vbif            d2,  d20, d15 // out p3
+        vaddl.u8        q9,  d20, d25 // p3 + q1
+        vrshrn.i16      d7,  q6,  #4  // out q1
+        vadd.i16        q6,  q6,  q5  // - (p4 + q0) + (q3 + q6)
+        vsub.i16        q9,  q4,  q9
+        vaddl.u8        q5,  d29, d30 // q5 + q6
+        vbif            d3,  d21, d15 // out p2
+        vaddl.u8        q10, d21, d26 // p2 + q2
+        vrshrn.i16      d8,  q6,  #4  // out q2
+        vadd.i16        q6,  q6,  q9  // - (p3 + q1) + (q4 + q6)
+        vsub.i16        q5,  q5,  q10
+        vaddl.u8        q9,  d30, d30 // q6 + q6
+        vbif            d4,  d22, d15 // out p1
+        vaddl.u8        q10, d22, d27 // p1 + q3
+        vrshrn.i16      d9,  q6,  #4  // out q3
+        vadd.i16        q6,  q6,  q5  // - (p2 + q2) + (q5 + q6)
+        vsub.i16        q9,  q9,  q10
+        vbif            d5,  d23, d15 // out p0
+        vrshrn.i16      d10, q6,  #4  // out q4
+        vadd.i16        q6,  q6,  q9  // - (p1 + q3) + (q6 + q6)
+        vrshrn.i16      d11, q6,  #4  // out q5
+        vbif            d6,  d24, d15 // out q0
+        vbif            d7,  d25, d15 // out q1
+        vbif            d8,  d26, d15 // out q2
+        vbif            d9,  d27, d15 // out q3
+        vbif            d10, d28, d15 // out q4
+        vbif            d11, d29, d15 // out q5
+.endif
+
+        bx              lr
+.if \wd == 16
+7:
+        // Return to a shorter epilogue, writing only the inner 6 pixels
+        bx              r8
+.endif
+.if \wd >= 8
+8:
+        // Return to a shorter epilogue, writing only the inner 4 pixels
+        bx              r9
+.endif
+9:
+        // Return directly without writing back any pixels
+        bx              r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
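+// A hedged scalar sketch (hypothetical helper, not dav1d's C code) of what
+// the wd == 16 path above computes: out p5..q5 are rounded 16-weight
+// averages over p6..q6, maintained as a sliding sum exactly like the
+// vadd.i16/vsub.i16 chain. buf[0..13] holds p6..p0,q0..q6 and out[0..11]
+// receives p5..q5.
+//
+//     #include <stdint.h>
+//
+//     static void flat16_sketch(const uint8_t buf[14], uint8_t out[12]) {
+//         int sum = 7 * buf[0] + 2 * buf[1] + 2 * buf[2]        // 7*p6 + 2*p5 + 2*p4
+//                 + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]; // p3+p2+p1+p0+q0
+//         for (int i = 0; i < 12; i++) {
+//             out[i] = (sum + 8) >> 4;                          // vrshrn.i16 #4
+//             if (i < 11)                                       // slide the window:
+//                 sum += buf[i + 3] + buf[i + 8 < 13 ? i + 8 : 13]  // add two taps
+//                      - buf[i] - buf[i - 5 > 0 ? i - 5 : 0];       // drop two taps
+//         }
+//     }
+//
+// Clamping the indices to 0 and 13 mirrors how p6 and q6 are reused
+// (replicated) past the ends of the window in the code above.
+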
+.macro lpf_8_wd16
+        adr             r8,  7f + CONFIG_THUMB
+        adr             r9,  8f + CONFIG_THUMB
+        bl              lpf_8_wd16_neon
+.endm
+
+.macro lpf_8_wd8
+        adr             r9,  8f + CONFIG_THUMB
+        bl              lpf_8_wd8_neon
+.endm
+
+.macro lpf_8_wd6
+        bl              lpf_8_wd6_neon
+.endm
+
+.macro lpf_8_wd4
+        bl              lpf_8_wd4_neon
+.endm
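+
+// The adr instructions in lpf_8_wd16/lpf_8_wd8 above load the addresses of
+// the 7: and 8: labels of the calling function (the CONFIG_THUMB term keeps
+// the Thumb bit set), so that the shared lpf_8_wd*_neon bodies can return
+// directly into the caller's shorter epilogues via bx r8/bx r9 when only
+// the inner 6 or 4 pixels need to be written back, and via bx r12 when
+// nothing needs writing at all.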
+
+function lpf_v_4_8_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #1
+        vld1.8          {d22}, [r10, :64], r1 // p1
+        vld1.8          {d24}, [r0,  :64], r1 // q0
+        vld1.8          {d23}, [r10, :64], r1 // p0
+        vld1.8          {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+
+        lpf_8_wd4
+
+        sub             r10, r0,  r1, lsl #1
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_4_8_neon
+        mov             r12, lr
+        sub             r10, r0,  #2
+        add             r0,  r10, r1, lsl #2
+        vld1.32         {d22[0]}, [r10], r1
+        vld1.32         {d22[1]}, [r0],  r1
+        vld1.32         {d23[0]}, [r10], r1
+        vld1.32         {d23[1]}, [r0],  r1
+        vld1.32         {d24[0]}, [r10], r1
+        vld1.32         {d24[1]}, [r0],  r1
+        vld1.32         {d25[0]}, [r10], r1
+        vld1.32         {d25[1]}, [r0],  r1
+        add             r0,  r0,  #2
+
+        transpose_4x8b  q11, q12, d22, d23, d24, d25
+
+        lpf_8_wd4
+
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #2
+        transpose_4x8b  q11, q12, d22, d23, d24, d25
+        add             r0,  r10, r1, lsl #2
+
+        vst1.32         {d22[0]}, [r10], r1
+        vst1.32         {d22[1]}, [r0],  r1
+        vst1.32         {d23[0]}, [r10], r1
+        vst1.32         {d23[1]}, [r0],  r1
+        vst1.32         {d24[0]}, [r10], r1
+        vst1.32         {d24[1]}, [r0],  r1
+        vst1.32         {d25[0]}, [r10], r1
+        vst1.32         {d25[1]}, [r0],  r1
+        add             r0,  r0,  #2
+        bx              r12
+endfunc
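+
+// The lpf_h_* functions below follow the same pattern as lpf_h_4 above:
+// load 8 rows, transpose so that each pixel column ends up in one vector
+// register, run the same filter code as the vertical variant, then
+// transpose back and store the relevant columns.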
+
+function lpf_v_6_8_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #1
+        sub             r10, r10, r1
+        vld1.8          {d21}, [r10, :64], r1 // p2
+        vld1.8          {d24}, [r0,  :64], r1 // q0
+        vld1.8          {d22}, [r10, :64], r1 // p1
+        vld1.8          {d25}, [r0,  :64], r1 // q1
+        vld1.8          {d23}, [r10, :64], r1 // p0
+        vld1.8          {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+
+        lpf_8_wd6
+
+        sub             r10, r0,  r1, lsl #1
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_6_8_neon
+        mov             r12, lr
+        sub             r10, r0,  #4
+        add             r0,  r10, r1, lsl #2
+        vld1.8          {d20}, [r10], r1
+        vld1.8          {d24}, [r0],  r1
+        vld1.8          {d21}, [r10], r1
+        vld1.8          {d25}, [r0],  r1
+        vld1.8          {d22}, [r10], r1
+        vld1.8          {d26}, [r0],  r1
+        vld1.8          {d23}, [r10], r1
+        vld1.8          {d27}, [r0],  r1
+        add             r0,  r0,  #4
+
+        transpose_8x8b  q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+        lpf_8_wd6
+
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #2
+        transpose_4x8b  q11, q12, d22, d23, d24, d25
+        add             r0,  r10, r1, lsl #2
+
+        vst1.32         {d22[0]}, [r10], r1
+        vst1.32         {d22[1]}, [r0],  r1
+        vst1.32         {d23[0]}, [r10], r1
+        vst1.32         {d23[1]}, [r0],  r1
+        vst1.32         {d24[0]}, [r10], r1
+        vst1.32         {d24[1]}, [r0],  r1
+        vst1.32         {d25[0]}, [r10], r1
+        vst1.32         {d25[1]}, [r0],  r1
+        add             r0,  r0,  #2
+        bx              r12
+endfunc
+
+function lpf_v_8_8_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #2
+        vld1.8          {d20}, [r10, :64], r1 // p3
+        vld1.8          {d24}, [r0,  :64], r1 // q0
+        vld1.8          {d21}, [r10, :64], r1 // p2
+        vld1.8          {d25}, [r0,  :64], r1 // q1
+        vld1.8          {d22}, [r10, :64], r1 // p1
+        vld1.8          {d26}, [r0,  :64], r1 // q2
+        vld1.8          {d23}, [r10, :64], r1 // p0
+        vld1.8          {d27}, [r0,  :64], r1 // q3
+        sub             r0,  r0,  r1, lsl #2
+
+        lpf_8_wd8
+
+        sub             r10, r0,  r1, lsl #1
+        sub             r10, r10,  r1
+        vst1.8          {d21}, [r10, :64], r1 // p2
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+        bx              r12
+
+8:
+        sub             r10, r0,  r1, lsl #1
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_8_8_neon
+        mov             r12, lr
+        sub             r10, r0,  #4
+        add             r0,  r10, r1, lsl #2
+        vld1.8          {d20}, [r10], r1
+        vld1.8          {d24}, [r0],  r1
+        vld1.8          {d21}, [r10], r1
+        vld1.8          {d25}, [r0],  r1
+        vld1.8          {d22}, [r10], r1
+        vld1.8          {d26}, [r0],  r1
+        vld1.8          {d23}, [r10], r1
+        vld1.8          {d27}, [r0],  r1
+        add             r0,  r0,  #4
+
+        transpose_8x8b  q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+        lpf_8_wd8
+
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #4
+        transpose_8x8b  q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+        add             r0,  r10, r1, lsl #2
+
+        vst1.8          {d20}, [r10], r1
+        vst1.8          {d24}, [r0],  r1
+        vst1.8          {d21}, [r10], r1
+        vst1.8          {d25}, [r0],  r1
+        vst1.8          {d22}, [r10], r1
+        vst1.8          {d26}, [r0],  r1
+        vst1.8          {d23}, [r10], r1
+        vst1.8          {d27}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+8:
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #2
+        transpose_4x8b  q11, q12, d22, d23, d24, d25
+        add             r0,  r10, r1, lsl #2
+
+        vst1.32         {d22[0]}, [r10], r1
+        vst1.32         {d22[1]}, [r0],  r1
+        vst1.32         {d23[0]}, [r10], r1
+        vst1.32         {d23[1]}, [r0],  r1
+        vst1.32         {d24[0]}, [r10], r1
+        vst1.32         {d24[1]}, [r0],  r1
+        vst1.32         {d25[0]}, [r10], r1
+        vst1.32         {d25[1]}, [r0],  r1
+        add             r0,  r0,  #2
+        bx              r12
+endfunc
+
+function lpf_v_16_8_neon
+        mov             r12, lr
+
+        sub             r10, r0,  r1, lsl #3
+        add             r10, r10, r1
+        vld1.8          {d17}, [r10, :64], r1 // p6
+        vld1.8          {d24}, [r0,  :64], r1 // q0
+        vld1.8          {d18}, [r10, :64], r1 // p5
+        vld1.8          {d25}, [r0,  :64], r1 // q1
+        vld1.8          {d19}, [r10, :64], r1 // p4
+        vld1.8          {d26}, [r0,  :64], r1 // q2
+        vld1.8          {d20}, [r10, :64], r1 // p3
+        vld1.8          {d27}, [r0,  :64], r1 // q3
+        vld1.8          {d21}, [r10, :64], r1 // p2
+        vld1.8          {d28}, [r0,  :64], r1 // q4
+        vld1.8          {d22}, [r10, :64], r1 // p1
+        vld1.8          {d29}, [r0,  :64], r1 // q5
+        vld1.8          {d23}, [r10, :64], r1 // p0
+        vld1.8          {d30}, [r0,  :64], r1 // q6
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  r1
+
+        lpf_8_wd16
+
+        sub             r10, r0,  r1, lsl #2
+        sub             r10, r10, r1, lsl #1
+        vst1.8          {d0},  [r10, :64], r1 // p5
+        vst1.8          {d6},  [r0,  :64], r1 // q0
+        vst1.8          {d1},  [r10, :64], r1 // p4
+        vst1.8          {d7},  [r0,  :64], r1 // q1
+        vst1.8          {d2},  [r10, :64], r1 // p3
+        vst1.8          {d8},  [r0,  :64], r1 // q2
+        vst1.8          {d3},  [r10, :64], r1 // p2
+        vst1.8          {d9},  [r0,  :64], r1 // q3
+        vst1.8          {d4},  [r10, :64], r1 // p1
+        vst1.8          {d10}, [r0,  :64], r1 // q4
+        vst1.8          {d5},  [r10, :64], r1 // p0
+        vst1.8          {d11}, [r0,  :64], r1 // q5
+        sub             r0,  r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+7:
+        sub             r10, r0,  r1
+        sub             r10, r10, r1, lsl #1
+        vst1.8          {d21}, [r10, :64], r1 // p2
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+        bx              r12
+
+8:
+        sub             r10, r0,  r1, lsl #1
+        vst1.8          {d22}, [r10, :64], r1 // p1
+        vst1.8          {d24}, [r0,  :64], r1 // q0
+        vst1.8          {d23}, [r10, :64], r1 // p0
+        vst1.8          {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_16_8_neon
+        mov             r12, lr
+        sub             r10, r0,  #8
+        vld1.8          {d16}, [r10, :64], r1
+        vld1.8          {d24}, [r0,  :64], r1
+        vld1.8          {d17}, [r10, :64], r1
+        vld1.8          {d25}, [r0,  :64], r1
+        vld1.8          {d18}, [r10, :64], r1
+        vld1.8          {d26}, [r0,  :64], r1
+        vld1.8          {d19}, [r10, :64], r1
+        vld1.8          {d27}, [r0,  :64], r1
+        vld1.8          {d20}, [r10, :64], r1
+        vld1.8          {d28}, [r0,  :64], r1
+        vld1.8          {d21}, [r10, :64], r1
+        vld1.8          {d29}, [r0,  :64], r1
+        vld1.8          {d22}, [r10, :64], r1
+        vld1.8          {d30}, [r0,  :64], r1
+        vld1.8          {d23}, [r10, :64], r1
+        vld1.8          {d31}, [r0,  :64], r1
+
+        transpose_8x8b  q8,  q9,  q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+        transpose_8x8b  q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
+
+        lpf_8_wd16
+
+        sub             r0,  r0,  r1, lsl #3
+        sub             r10, r0,  #8
+
+        transpose_8x8b  q8,  q0,  q1,  q2,  d16, d17, d0,  d1,  d2,  d3,  d4,  d5
+        transpose_8x8b  q3,  q4,  q5,  q15, d6,  d7,  d8,  d9,  d10, d11, d30, d31
+
+        vst1.8          {d16}, [r10, :64], r1
+        vst1.8          {d6},  [r0,  :64], r1
+        vst1.8          {d17}, [r10, :64], r1
+        vst1.8          {d7},  [r0,  :64], r1
+        vst1.8          {d0},  [r10, :64], r1
+        vst1.8          {d8},  [r0,  :64], r1
+        vst1.8          {d1},  [r10, :64], r1
+        vst1.8          {d9},  [r0,  :64], r1
+        vst1.8          {d2},  [r10, :64], r1
+        vst1.8          {d10}, [r0,  :64], r1
+        vst1.8          {d3},  [r10, :64], r1
+        vst1.8          {d11}, [r0,  :64], r1
+        vst1.8          {d4},  [r10, :64], r1
+        vst1.8          {d30}, [r0,  :64], r1
+        vst1.8          {d5},  [r10, :64], r1
+        vst1.8          {d31}, [r0,  :64], r1
+        bx              r12
+
+7:
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #4
+        transpose_8x8b  q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+        add             r0,  r10, r1, lsl #2
+
+        vst1.8          {d20}, [r10], r1
+        vst1.8          {d24}, [r0],  r1
+        vst1.8          {d21}, [r10], r1
+        vst1.8          {d25}, [r0],  r1
+        vst1.8          {d22}, [r10], r1
+        vst1.8          {d26}, [r0],  r1
+        vst1.8          {d23}, [r10], r1
+        vst1.8          {d27}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+8:
+        sub             r10, r0,  r1, lsl #3
+        sub             r10, r10, #2
+        transpose_4x8b  q11, q12, d22, d23, d24, d25
+        add             r0,  r10, r1, lsl #2
+
+        vst1.32         {d22[0]}, [r10], r1
+        vst1.32         {d22[1]}, [r0],  r1
+        vst1.32         {d23[0]}, [r10], r1
+        vst1.32         {d23[1]}, [r0],  r1
+        vst1.32         {d24[0]}, [r10], r1
+        vst1.32         {d24[1]}, [r0],  r1
+        vst1.32         {d25[0]}, [r10], r1
+        vst1.32         {d25[1]}, [r0],  r1
+        add             r0,  r0,  #2
+        bx              r12
+endfunc
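+
+// Each of the lpf_{v,h}_16_8 functions above carries three store epilogues:
+// the fall-through one writes p5..q5 (the flat8out path), 7: writes only
+// p2..q2 (flat8in) and 8: writes only p1..q1 (the plain 4-pixel filter);
+// the shared filter body selects among them through lr, r8 and r9 (set up
+// by the lpf_8_wd* macros), and skips the stores entirely via bx r12.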
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                 const uint32_t *const vmask,
+//                                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
+//                                 const Av1FilterLUT *lut, const int w)
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [r2] // vmask[0], vmask[1]
+.ifc \type, y
+        ldr             r2,  [r2, #8]  // vmask[2]
+.endif
+        add             r5,  r5,  #128 // Move to sharp part of lut
+.ifc \type, y
+        orr             r7,  r7,  r2   // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+        sub             r4,  r3,  r4, lsl #2
+.else
+        sub             r3,  r3,  #4
+        lsl             r4,  r4,  #2
+.endif
+        orr             r6,  r6,  r7   // vmask[0] |= vmask[1]
+
+1:
+        tst             r6,  #0x03
+.ifc \dir, v
+        vld1.8          {d0}, [r4]!
+        vld1.8          {d1}, [r3]!
+.else
+        vld2.32         {d0[0], d1[0]}, [r3], r4
+        vld2.32         {d0[1], d1[1]}, [r3], r4
+.endif
+        beq             7f             // if (!(vm & bits)) continue;
+
+        vld1.8          {d5[]}, [r5]   // sharp[0]
+        add             r5,  r5,  #8
+        vmov.i32        d2,  #0xff
+        vdup.32         d13, r6        // vmask[0]
+
+        vand            d0,  d0,  d2   // Keep only lowest byte in each 32 bit word
+        vand            d1,  d1,  d2
+        vtst.8          d3,  d1,  d2   // Check for nonzero values in l[0][0]
+        vmov.i8         d4,  #1
+        vld1.8          {d6[]}, [r5]   // sharp[1]
+        sub             r5,  r5,  #8
+        vbif            d1,  d0,  d3   // if (!l[0][0]) L = l[offset][0]
+        vmul.i32        d1,  d1,  d4   // L
+.ifc \type, y
+        vdup.32         d15, r2        // vmask[2]
+.endif
+        vtst.32         d2,  d1,  d2   // L != 0
+        vdup.32         d14, r7        // vmask[1]
+        vmov            r10, r11, d2
+        orrs            r10, r10, r11
+        beq             7f             // if (!L) continue;
+        vneg.s8         d5,  d5        // -sharp[0]
+        movrel_local    r10, word_12
+        vshr.u8         d12, d1,  #4   // H
+        vld1.32         {d16}, [r10, :64]
+        vshl.s8         d3,  d1,  d5   // L >> sharp[0]
+.ifc \type, y
+        vtst.32         d15, d15, d16  // if (vmask[2] & bits)
+.endif
+        vmov.i8         d7,  #2
+        vmin.u8         d3,  d3,  d6   // imin(L >> sharp[0], sharp[1])
+        vadd.i8         d0,  d1,  d7   // L + 2
+        vmax.u8         d11, d3,  d4   // imax(imin(), 1) = limit = I
+        vadd.u8         d0,  d0,  d0   // 2*(L + 2)
+        vtst.32         d14, d14, d16  // if (vmask[1] & bits)
+        vadd.i8         d10, d0,  d11  // 2*(L + 2) + limit = E
+        vtst.32         d13, d13, d16  // if (vmask[0] & bits)
+        vand            d13, d13, d2   // vmask[0] &= L != 0
+
+.ifc \type, y
+        tst             r2,  #0x03
+        beq             2f
+        // wd16
+        bl              lpf_\dir\()_16_8_neon
+        b               8f
+2:
+.endif
+        tst             r7,  #0x03
+        beq             3f
+.ifc \type, y
+        // wd8
+        bl              lpf_\dir\()_8_8_neon
+.else
+        // wd6
+        bl              lpf_\dir\()_6_8_neon
+.endif
+        b               8f
+3:
+        // wd4
+        bl              lpf_\dir\()_4_8_neon
+.ifc \dir, h
+        b               8f
+7:
+        // For dir h, the functions above increment r0.
+        // If the whole function is skipped, increment it here instead.
+        add             r0,  r0,  r1,  lsl #3
+.else
+7:
+.endif
+8:
+        lsrs            r6,  r6,  #2   // vmask[0] >>= 2
+        lsr             r7,  r7,  #2   // vmask[1] >>= 2
+.ifc \type, y
+        lsr             r2,  r2,  #2   // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+        add             r0,  r0,  #8
+.else
+        // For dir h, r0 is returned incremented
+.endif
+        bne             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
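+
+// A rough scalar sketch (hypothetical helper, not dav1d's C code) of the
+// threshold setup lpf_func performs above for each filtered edge unit,
+// following the comments on the vshr/vshl/vmin/vmax/vadd sequence: L falls
+// back to the previous edge's filter level when the current one is zero,
+// and H, I and E are derived from L and the two sharpness parameters from
+// the lut.
+//
+//     static int lpf_limits(int cur_level, int prev_level,
+//                           int sharp0, int sharp1,
+//                           int *H, int *I, int *E)
+//     {
+//         const int L = cur_level ? cur_level : prev_level;
+//         if (!L) return 0;                // if (!L) continue;
+//         *H = L >> 4;
+//         int lim = L >> sharp0;           // vshl.s8 by -sharp[0]
+//         if (lim > sharp1) lim = sharp1;  // imin(L >> sharp[0], sharp[1])
+//         if (lim < 1) lim = 1;            // imax(imin(), 1) = limit = I
+//         *I = lim;
+//         *E = 2 * (L + 2) + lim;          // 2*(L + 2) + limit = E
+//         return 1;
+//     }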
+
+const word_12, align=4
+        .word 1, 2
+endconst
diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S
new file mode 100644 (file)
index 0000000..ea32d63
--- /dev/null
@@ -0,0 +1,2110 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                      const pixel *src, ptrdiff_t stride,
+//                                      const int16_t fh[7], const intptr_t w,
+//                                      int h, enum LrEdgeFlags edges);
+function wiener_filter_h_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4}
+        ldrd            r4,  r5,  [sp, #52]
+        ldrd            r6,  r7,  [sp, #60]
+        mov             r8,  r5
+        vld1.16         {q0},  [r4]
+        movw            r9,  #(1 << 14) - (1 << 2)
+        vdup.16         q14,  r9
+        vmov.s16        q15,  #2048
+        // Calculate mid_stride
+        add             r10, r5,  #7
+        bic             r10, r10, #7
+        lsl             r10, r10, #1
+
+        // Clear the last unused element of q0, to allow filtering a single
+        // pixel with one plain vmul+vpadd.
+        mov             r12, #0
+        vmov.16         d1[3], r12
+
+        // Set up pointers for reading/writing alternate rows
+        add             r12, r0,  r10
+        lsl             r10, r10, #1
+        add             lr,  r2,  r3
+        lsl             r3,  r3,  #1
+
+        // Subtract the width from mid_stride
+        sub             r10, r10, r5, lsl #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels, for w < 8 we read 16 pixels.
+        cmp             r5,  #8
+        add             r11, r5,  #13
+        bic             r11, r11, #7
+        bge             1f
+        mov             r11, #16
+1:
+        sub             r3,  r3,  r11
+
+        // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r1,  #0
+        bne             0f
+        // left == NULL
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r3,  r3,  #3
+
+
+1:      // Loop vertically
+        vld1.8          {q2},  [r2]!
+        vld1.8          {q9},  [lr]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r1,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[1]},  [r1]!
+        // Move r2/lr back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        vld1.32         {d17[1]},  [r1]!
+        vext.8          q2,  q1,  q2,  #13
+        vext.8          q9,  q8,  q9,  #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1/q8 with the leftmost byte of each row
+        // and shift q2/q9 to have 3x the first byte at the front.
+        vdup.8          q1, d4[0]
+        vdup.8          q8, d18[0]
+        // Move r2/lr back to account for the last 3 bytes we loaded before,
+        // which we shifted out.
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        vext.8          q2,  q1,  q2,  #13
+        vext.8          q9,  q8,  q9,  #13
+
+2:
+        vmovl.u8        q1,  d4
+        vmovl.u8        q2,  d5
+        vmovl.u8        q8,  d18
+        vmovl.u8        q9,  d19
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             r9,  r5, #14
+        ldrb            r11, [r2, r9]
+        ldrb            r9,  [lr, r9]
+        // Fill q12/q13 with the right padding pixel
+        vdup.8          d24, r11
+        vdup.8          d26, r9
+        vmovl.u8        q12, d24
+        vmovl.u8        q13, d26
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro filter_8
+        // This is tuned as some sort of compromise between Cortex A7, A8,
+        // A9 and A53.
+        vmul.s16        q3,  q1,  d0[0]
+        vext.8          q10, q1,  q2,  #2
+        vext.8          q11, q1,  q2,  #4
+        vmla.s16        q3,  q10, d0[1]
+        vmla.s16        q3,  q11, d0[2]
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vmla.s16        q3,  q10, d0[3]
+        vmla.s16        q3,  q11, d1[0]
+        vext.8          q10, q1,  q2,  #10
+        vext.8          q11, q1,  q2,  #12
+        vmla.s16        q3,  q10, d1[1]
+        vmla.s16        q3,  q11, d1[2]
+
+        vmul.s16        q10, q8,  d0[0]
+        vext.8          q11, q8,  q9,  #2
+        vext.8          q4,  q8,  q9,  #4
+        vmla.s16        q10, q11, d0[1]
+        vmla.s16        q10, q4,  d0[2]
+        vext.8          q11, q8,  q9,  #6
+        vext.8          q4,  q8,  q9,  #8
+        vmla.s16        q10, q11, d0[3]
+        vmla.s16        q10, q4,  d1[0]
+        vext.8          q11, q8,  q9,  #10
+        vext.8          q4,  q8,  q9,  #12
+        vmla.s16        q10, q11, d1[1]
+        vmla.s16        q10, q4,  d1[2]
+
+        vext.8          q1,  q1,  q2,  #6
+        vext.8          q8,  q8,  q9,  #6
+        vshl.s16        q1,  q1,  #7
+        vshl.s16        q8,  q8,  #7
+        vsub.s16        q1,  q1,  q14
+        vsub.s16        q8,  q8,  q14
+        vqadd.s16       q3,  q3,  q1
+        vqadd.s16       q10, q10, q8
+        vshr.s16        q3,  q3,  #3
+        vshr.s16        q10, q10, #3
+        vadd.s16        q3,  q3,  q15
+        vadd.s16        q10, q10, q15
+.endm
+        filter_8
+        vst1.16         {q3},  [r0,  :128]!
+        vst1.16         {q10}, [r12, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q1,  q2
+        vmov            q8,  q9
+        vld1.8          {d4},  [r2]!
+        vld1.8          {d18}, [lr]!
+        vmovl.u8        q2,  d4
+        vmovl.u8        q9,  d18
+        bne             4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Filter 4 pixels, 7 <= w < 11
+.macro filter_4
+        vmul.s16        d6,  d2,  d0[0]
+        vext.8          q10, q1,  q2,  #2
+        vext.8          q11, q1,  q2,  #4
+        vmla.s16        d6,  d20, d0[1]
+        vmla.s16        d6,  d22, d0[2]
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vmla.s16        d6,  d20, d0[3]
+        vmla.s16        d6,  d22, d1[0]
+        vext.8          q10, q1,  q2,  #10
+        vext.8          q11, q1,  q2,  #12
+        vmla.s16        d6,  d20, d1[1]
+        vmla.s16        d6,  d22, d1[2]
+
+        vmul.s16        d20, d16, d0[0]
+        vext.8          q11, q8,  q9,  #2
+        vext.8          q4,  q8,  q9,  #4
+        vmla.s16        d20, d22, d0[1]
+        vmla.s16        d20, d8,  d0[2]
+        vext.8          q11, q8,  q9,  #6
+        vext.8          q4,  q8,  q9,  #8
+        vmla.s16        d20, d22, d0[3]
+        vmla.s16        d20, d8,  d1[0]
+        vext.8          q11, q8,  q9,  #10
+        vext.8          q4,  q8,  q9,  #12
+        vmla.s16        d20, d22, d1[1]
+        vmla.s16        d20, d8,  d1[2]
+
+        vext.8          q11, q1,  q2,  #6
+        vshl.s16        d22, d22, #7
+        vsub.s16        d22, d22, d28
+        vqadd.s16       d6,  d6,  d22
+        vext.8          q11, q8,  q9,  #6
+        vshl.s16        d22, d22, #7
+        vsub.s16        d22, d22, d28
+        vqadd.s16       d20, d20, d22
+        vshr.s16        d6,  d6,  #3
+        vshr.s16        d20, d20, #3
+        vadd.s16        d6,  d6,  d30
+        vadd.s16        d20, d20, d30
+.endm
+        filter_4
+        vst1.16         {d6},  [r0,  :64]!
+        vst1.16         {d20}, [r12, :64]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q2,  q2,  q2,  #8
+        vext.8          q8,  q8,  q9,  #8
+        vext.8          q9,  q9,  q9,  #8
+
+6:      // Pad the right edge and filter the last few pixels.
+        // w < 7, w+3 pixels valid in q1-q2
+        cmp             r5,  #5
+        blt             7f
+        bgt             8f
+        // w == 5, 8 pixels valid in q1, q2 invalid
+        vmov            q2,  q12
+        vmov            q9,  q13
+        b               88f
+
+7:      // 1 <= w < 5, 4-7 pixels valid in q1
+        sub             r9,  r5,  #1
+        // r9 = (pixels valid - 4)
+        adr             r11, L(variable_shift_tbl)
+        ldr             r9,  [r11, r9, lsl #2]
+        add             r11, r11, r9
+        vmov            q2,  q12
+        vmov            q9,  q13
+        bx              r11
+
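+        // Offset-based jump table: each entry is the distance from the
+        // table base to its handler (with the Thumb bit added through
+        // CONFIG_THUMB); the entry is loaded, added to the table address
+        // obtained with adr above, and jumped to with bx.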
+        .align 2
+L(variable_shift_tbl):
+        .word 44f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(variable_shift_tbl) + CONFIG_THUMB
+
+44:     // 4 pixels valid in d2/d16, fill d3/d17 with padding.
+        vmov            d3,  d4
+        vmov            d17, d18
+        b               88f
+        // Shift q1 right, shifting out invalid pixels,
+        // shift q1 left to the original offset, shifting in padding pixels.
+55:     // 5 pixels valid
+        vext.8          q1,  q1,  q1,  #10
+        vext.8          q1,  q1,  q2,  #6
+        vext.8          q8,  q8,  q8,  #10
+        vext.8          q8,  q8,  q9,  #6
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q1,  q1,  q1,  #12
+        vext.8          q1,  q1,  q2,  #4
+        vext.8          q8,  q8,  q8,  #12
+        vext.8          q8,  q8,  q9,  #4
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q1,  q1,  q1,  #14
+        vext.8          q1,  q1,  q2,  #2
+        vext.8          q8,  q8,  q8,  #14
+        vext.8          q8,  q8,  q9,  #2
+        b               88f
+
+8:      // w == 6 (the only w > 5 case left), 9 pixels valid in q1-q2, 1 pixel valid in q2
+        vext.8          q2,  q2,  q2,  #2
+        vext.8          q2,  q2,  q12, #14
+        vext.8          q9,  q9,  q9,  #2
+        vext.8          q9,  q9,  q13, #14
+
+88:
+        // w < 7, q1-q2 padded properly
+        cmp             r5,  #4
+        blt             888f
+
+        // w >= 4, filter 4 pixels
+        filter_4
+        vst1.16         {d6},  [r0,  :64]!
+        vst1.16         {d20}, [r12, :64]!
+        subs            r5,  r5,  #4 // 0 <= w < 4
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q8,  q8,  q9,  #8
+        beq             9f
+888:    // 1 <= w < 4, filter 1 pixel at a time
+        vmul.s16        q3,  q1,  q0
+        vmul.s16        q10, q8,  q0
+        vpadd.s16       d6,  d6,  d7
+        vpadd.s16       d7,  d20, d21
+        vdup.16         d24, d2[3]
+        vpadd.s16       d6,  d6,  d7
+        vdup.16         d25, d16[3]
+        vpadd.s16       d6,  d6,  d6
+        vtrn.16         d24, d25
+        vshl.s16        d24, d24,  #7
+        vsub.s16        d24, d24,  d28
+        vqadd.s16       d6,  d6,   d24
+        vshr.s16        d6,  d6,   #3
+        vadd.s16        d6,  d6,   d30
+        vst1.s16        {d6[0]}, [r0,  :16]!
+        vst1.s16        {d6[1]}, [r12, :16]!
+        subs            r5,  r5,  #1
+        vext.8          q1,  q1,  q2,  #2
+        vext.8          q8,  q8,  q9,  #2
+        bgt             888b
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r10
+        add             r12, r12, r10
+        add             r2,  r2,  r3
+        add             lr,  lr,  r3
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+.purgem filter_8
+.purgem filter_4
+endfunc
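+
+// A loose scalar paraphrase (hypothetical, not dav1d's reference C) of what
+// filter_8/filter_4 compute per output: a 7-tap convolution with fh plus
+// the centre pixel scaled by 128, rebiased by -((1 << 14) - (1 << 2))
+// before the >> 3 and the +2048 offset. The NEON code accumulates in
+// wrapping 16-bit lanes and saturates only at the vqadd.s16; this sketch
+// glosses over that by saturating the combined sum.
+//
+//     #include <stdint.h>
+//
+//     static int16_t wiener_h_px(const uint8_t *p /* p[-3..3] */,
+//                                const int16_t fh[7])
+//     {
+//         int sum = 0;
+//         for (int k = 0; k < 7; k++)
+//             sum += fh[k] * p[k - 3];                 // vmul/vmla chain
+//         sum += (p[0] << 7) - ((1 << 14) - (1 << 2)); // vshl #7, vsub q14
+//         if (sum < -32768) sum = -32768;              // vqadd.s16 saturation
+//         if (sum >  32767) sum =  32767;
+//         return (int16_t)((sum >> 3) + 2048);         // vshr #3, vadd q15
+//     }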
+
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                      const int16_t *mid, int w, int h,
+//                                      const int16_t fv[7], enum LrEdgeFlags edges,
+//                                      ptrdiff_t mid_stride);
+function wiener_filter_v_8bpc_neon, export=1
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldrd            r6,  r7,  [sp, #28]
+        mov             lr,  r4
+        vmov.s16        q1,  #0
+        mov             r12, #128
+        vld1.16         {q0},  [r5]
+        vmov.s16        d2[3], r12
+        vadd.s16        q0,  q0,  q1
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             r12, r4
+        tst             r6,  #4 // LR_HAVE_TOP
+        beq             0f
+        sub             r2,  r2,  r7,  lsl #1
+        add             r12, r12, #2
+0:
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        add             r12, r12, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into q8-q11 and pad properly.
+        tst             r6,  #4 // LR_HAVE_TOP
+        vld1.16         {q8},  [r2, :128], r7
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.16         {q10}, [r2, :128], r7
+        vmov            q9,  q8
+        vld1.16         {q11}, [r2, :128], r7
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q9,  q8
+        vmov            q10, q8
+        vmov            q11, q8
+
+3:
+        cmp             r4,  #4
+        blt             5f
+        // Start filtering normally; fill in q12-q14 with unique rows.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vld1.16         {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+        subs            r4,  r4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, so the mul/mla operations are kept
+        // tightly chained like this.
+        vmull.s16       q2,  d16,  d0[0]
+        vmlal.s16       q2,  d18,  d0[1]
+        vmlal.s16       q2,  d20,  d0[2]
+        vmlal.s16       q2,  d22,  d0[3]
+        vmlal.s16       q2,  d24,  d1[0]
+        vmlal.s16       q2,  d26,  d1[1]
+        vmlal.s16       q2,  d28,  d1[2]
+        vmull.s16       q3,  d17,  d0[0]
+        vmlal.s16       q3,  d19,  d0[1]
+        vmlal.s16       q3,  d21,  d0[2]
+        vmlal.s16       q3,  d23,  d0[3]
+        vmlal.s16       q3,  d25,  d1[0]
+        vmlal.s16       q3,  d27,  d1[1]
+        vmlal.s16       q3,  d29,  d1[2]
+        vqrshrun.s32    d4,  q2,   #11
+        vqrshrun.s32    d5,  q3,   #11
+        vqmovun.s16     d4,  q2
+        vst1.8          {d4}, [r0], r1
+.if \compare
+        cmp             r4,  #4
+.else
+        ble             9f
+.endif
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vmov            q10, q11
+        vmov            q11, q12
+        vmov            q12, q13
+        vmov            q13, q14
+.endm
+        filter          1
+        blt             7f
+        vld1.16         {q14}, [r2, :128], r7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of q12-q14 are filled yet.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             6f
+        // LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        // We load at least 2 rows in all cases.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        bgt             53f // 3 rows in total
+        beq             52f // 2 rows in total
+51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
+        vmov            q14, q13
+        b               8f
+52:     // 2 rows in total, q11 already loaded, load q12 with content data
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vmov            q15,  q14
+        b               8f
+53:
+        // 3 rows in total, q11 already loaded, load q12 and q13 with content
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        bgt             63f // 3 rows in total
+        beq             62f // 2 rows in total
+61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
+        vmov            q12, q11
+        vmov            q13, q11
+        vmov            q14, q11
+        b               8f
+62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+        vld1.16         {q12}, [r2, :128], r7
+        vmov            q13, q12
+        vmov            q14, q12
+        vmov            q15, q12
+        b               8f
+63:
+        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+        b               8f
+
+7:
+        // All registers up to q13 are filled already, 3 valid rows left;
+        // fill in padding and filter the last few rows.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+
+8:      // At this point, all registers up to q14-15,q1 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        vmov            q14, q15
+        vmov            q15, q1
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            r3,  r3,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        mls             r0,  r1,  lr,  r0
+        mls             r2,  r7,  r12, r2
+        add             r0,  r0,  #8
+        add             r2,  r2,  #16
+        mov             r4,  lr
+        b               1b
+
+0:
+        pop             {r4-r7,pc}
+.purgem filter
+endfunc
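+
+// A matching scalar sketch (hypothetical) for the vertical pass: a 7-tap
+// column convolution over the int16_t mid rows, with 128 added to the
+// centre tap (the vmov.s16 d2[3]/vadd.s16 setup above), rounded with
+// >> 11 and saturated to 8 bits as in the vqrshrun/vqmovun pair. Here
+// mid_stride is counted in int16_t elements.
+//
+//     #include <stddef.h>
+//     #include <stdint.h>
+//
+//     static uint8_t wiener_v_px(const int16_t *mid, ptrdiff_t mid_stride,
+//                                const int16_t fv[7])
+//     {
+//         int sum = 0;
+//         for (int k = 0; k < 7; k++)
+//             sum += (fv[k] + (k == 3 ? 128 : 0))      // centre tap + 128
+//                  * mid[(k - 3) * mid_stride];        // vmull/vmlal.s16
+//         sum = (sum + (1 << 10)) >> 11;               // vqrshrun.s32 #11
+//         return sum < 0 ? 0 : sum > 255 ? 255 : sum;  // vqmovun.s16
+//     }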
+
+// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                  const pixel *src, int w, int h);
+function copy_narrow_8bpc_neon, export=1
+        push            {r4,lr}
+        ldr             r4, [sp, #8]
+        adr             r12, L(copy_narrow_tbl)
+        ldr             r3,  [r12, r3, lsl #2]
+        add             r12, r12, r3
+        bx              r12
+
+        .align 2
+L(copy_narrow_tbl):
+        .word 0
+        .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
+
+10:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+18:
+        subs            r4,  r4,  #8
+        blt             110f
+        vld1.8          {d0}, [r2, :64]!
+        vst1.8          {d0[0]}, [r0], r1
+        vst1.8          {d0[1]}, [r3], r1
+        vst1.8          {d0[2]}, [r0], r1
+        vst1.8          {d0[3]}, [r3], r1
+        vst1.8          {d0[4]}, [r0], r1
+        vst1.8          {d0[5]}, [r3], r1
+        vst1.8          {d0[6]}, [r0], r1
+        vst1.8          {d0[7]}, [r3], r1
+        ble             0f
+        b               18b
+110:
+        add             r4,  r4,  #8
+        asr             r1,  r1,  #1
+11:
+        subs            r4,  r4,  #1
+        vld1.8          {d0[]},  [r2]!
+        vst1.8          {d0[0]}, [r0], r1
+        bgt             11b
+0:
+        pop             {r4,pc}
+
+20:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+24:
+        subs            r4,  r4,  #4
+        blt             210f
+        vld1.16         {d0}, [r2, :64]!
+        vst1.16         {d0[0]}, [r0, :16], r1
+        vst1.16         {d0[1]}, [r3, :16], r1
+        vst1.16         {d0[2]}, [r0, :16], r1
+        vst1.16         {d0[3]}, [r3, :16], r1
+        ble             0f
+        b               24b
+210:
+        add             r4,  r4,  #4
+        asr             r1,  r1,  #1
+22:
+        subs            r4,  r4,  #1
+        vld1.16         {d0[]},  [r2]!
+        vst1.16         {d0[0]}, [r0], r1
+        bgt             22b
+0:
+        pop             {r4,pc}
+
+30:
+        ldrh            r3,  [r2]
+        ldrb            r12, [r2, #2]
+        add             r2,  r2,  #3
+        subs            r4,  r4,  #1
+        strh            r3,  [r0]
+        strb            r12, [r0, #2]
+        add             r0,  r0,  r1
+        bgt             30b
+        pop             {r4,pc}
+
+40:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+42:
+        subs            r4,  r4,  #2
+        blt             41f
+        vld1.8          {d0}, [r2, :64]!
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d0[1]}, [r3, :32], r1
+        ble             0f
+        b               42b
+41:
+        vld1.32         {d0[]},  [r2]
+        vst1.32         {d0[0]}, [r0]
+0:
+        pop             {r4,pc}
+
+50:
+        ldr             r3,  [r2]
+        ldrb            r12, [r2, #4]
+        add             r2,  r2,  #5
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strb            r12, [r0, #4]
+        add             r0,  r0,  r1
+        bgt             50b
+        pop             {r4,pc}
+
+60:
+        ldr             r3,  [r2]
+        ldrh            r12, [r2, #4]
+        add             r2,  r2,  #6
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strh            r12, [r0, #4]
+        add             r0,  r0,  r1
+        bgt             60b
+        pop             {r4,pc}
+
+70:
+        ldr             r3,  [r2]
+        ldrh            r12, [r2, #4]
+        ldrb            lr,  [r2, #6]
+        add             r2,  r2,  #7
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strh            r12, [r0, #4]
+        strb            lr,  [r0, #6]
+        add             r0,  r0,  r1
+        bgt             70b
+        pop             {r4,pc}
+endfunc
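+
+// What copy_narrow amounts to, as a plain C sketch (hypothetical): the
+// source rows are packed contiguously at width w (w < 8, dispatched per
+// width through L(copy_narrow_tbl) above), while the destination rows are
+// a full stride apart.
+//
+//     #include <stddef.h>
+//     #include <stdint.h>
+//     #include <string.h>
+//
+//     static void copy_narrow_c(uint8_t *dst, ptrdiff_t stride,
+//                               const uint8_t *src, int w, int h)
+//     {
+//         for (int y = 0; y < h; y++) {
+//             memcpy(dst, src, w);  // the asm specializes this per w
+//             dst += stride;
+//             src += w;
+//         }
+//     }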
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                 const pixel (*left)[4],
+//                                 const pixel *src, const ptrdiff_t stride,
+//                                 const int w, const int h,
+//                                 const enum LrEdgeFlags edges);
+function sgr_box3_h_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+1:
+        sub             r9,  r9,  lr, lsl #1
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Subtract the number of pixels read from the input from the stride
+        add             lr,  r5,  #14
+        bic             lr,  lr,  #7
+        sub             r4,  r4,  lr
+
+        // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #2
+
+
+1:      // Loop vertically
+        vld1.8          {q0}, [r3]!
+        vld1.8          {q4}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[]}, [r2]!
+        // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        vld1.32         {d11[]}, [r2]!
+        vext.8          q0,  q1,  q0,  #14
+        vext.8          q4,  q5,  q4,  #14
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1/q5 with the leftmost byte of each row
+        // and shift q0/q4 to have 2x the first byte at the front.
+        vdup.8          q1,  d0[0]
+        vdup.8          q5,  d8[0]
+        // Move r3/r12 back to account for the last 2 bytes we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        vext.8          q0,  q1,  q0,  #14
+        vext.8          q4,  q5,  q4,  #14
+
+2:
+        vmull.u8        q1,  d0,  d0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5, #(2 + 16 - 2 + 1)
+        ldrb            r11, [r3,  lr]
+        ldrb            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.8          q14, r11
+        vdup.8          q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #10
+        bge             4f   // If w >= 10, all used input pixels are valid
+        cmp             r5,  #6
+        bge             5f   // If w >= 6, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro vaddl_u16_n      dst1, dst2, src1, src2, src3, src4, w
+        vaddl.u16       \dst1,  \src1,  \src3
+.if \w > 4
+        vaddl.u16       \dst2,  \src2,  \src4
+.endif
+.endm
+.macro vaddw_u16_n      dst1, dst2, src1, src2, w
+        vaddw.u16       \dst1,  \dst1,  \src1
+.if \w > 4
+        vaddw.u16       \dst2,  \dst2,  \src2
+.endif
+.endm
+.macro vadd_i32_n       dst1, dst2, src1, src2, w
+        vadd.i32        \dst1,  \dst1,  \src1
+.if \w > 4
+        vadd.i32        \dst2,  \dst2,  \src2
+.endif
+.endm
+
+.macro add3 w
+        vext.8          d16, d0,  d1,  #1
+        vext.8          d17, d0,  d1,  #2
+        vext.8          d18, d8,  d9,  #1
+        vext.8          d19, d8,  d9,  #2
+        vaddl.u8        q3,  d0,  d16
+        vaddw.u8        q3,  q3,  d17
+        vaddl.u8        q7,  d8,  d18
+        vaddw.u8        q7,  q7,  d19
+
+        vext.8          q8,  q1,  q2,  #2
+        vext.8          q9,  q1,  q2,  #4
+        vext.8          q10, q5,  q6,  #2
+        vext.8          q11, q5,  q6,  #4
+
+        vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
+        vaddw_u16_n     q12, q13, d18, d19, \w
+
+        vaddl_u16_n     q8,  q9,  d10, d11, d20, d21, \w
+        vaddw_u16_n     q8,  q9,  d22, d23, \w
+.endm
+        add3            8
+        vst1.16         {q3},       [r1,  :128]!
+        vst1.16         {q7},       [r11, :128]!
+        vst1.32         {q12, q13}, [r0,  :128]!
+        vst1.32         {q8,  q9},  [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vld1.8          {d6},  [r3]!
+        vld1.8          {d14}, [r12]!
+        vmov            q1,  q2
+        vmov            q5,  q6
+        vext.8          q0,  q0,  q3,  #8
+        vext.8          q4,  q4,  q7,  #8
+        vmull.u8        q2,  d6,  d6
+        vmull.u8        q6,  d14, d14
+
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 6 <= w < 10
+        add3            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+
+        subs            r5,  r5,  #4 // 2 <= w < 6
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // 2 <= w < 6, 2-5 pixels valid in q0
+        sub             lr,  r5,  #2
+        // lr = (pixels valid - 2)
+        adr             r11, L(box3_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box3_variable_shift_tbl):
+        .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0 right, shifting out invalid pixels,
+        // shift q0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #2
+        vext.8          q4,  q4,  q4,  #2
+        vext.8          q0,  q0,  q14, #14
+        vext.8          q4,  q4,  q15, #14
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #3
+        vext.8          q4,  q4,  q4,  #3
+        vext.8          q0,  q0,  q14, #13
+        vext.8          q4,  q4,  q15, #13
+        b               88f
+44:     // 4 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #5
+        vext.8          q4,  q4,  q4,  #5
+        vext.8          q0,  q0,  q14, #11
+        vext.8          q4,  q4,  q15, #11
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+        vmull.u8        q1,  d0,  d0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        add3            4
+        subs            r5,  r5,  #4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+        ble             9f
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q5,  q5,  q6,  #8
+        // Only one needed pixel left, but do a normal 4 pixel
+        // addition anyway
+        add3            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add3
+endfunc
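+
+// A scalar sketch (hypothetical) of the horizontal box sums that add3
+// above produces for one row: sum and sumsq receive, for each of the
+// w + 2 output positions (hence the "w += 2" at the top), the sum of 3
+// horizontally adjacent pixels and of their squares; src is assumed to
+// be already extended at both edges the way the edge handling above
+// arranges it.
+//
+//     #include <stdint.h>
+//
+//     static void box3_row(int32_t *sumsq, int16_t *sum,
+//                          const uint8_t *src, int w)
+//     {
+//         for (int x = 0; x < w + 2; x++) {
+//             int a = src[x], b = src[x + 1], c = src[x + 2];
+//             sum[x]   = a + b + c;              // vaddl.u8 + vaddw.u8
+//             sumsq[x] = a * a + b * b + c * c;  // vmull.u8, vaddl/vaddw.u16
+//         }
+//     }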
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                 const pixel (*left)[4],
+//                                 const pixel *src, const ptrdiff_t stride,
+//                                 const int w, const int h,
+//                                 const enum LrEdgeFlags edges);
+function sgr_box5_h_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8; without it, align to 4.
+        // Subtract the number of pixels read per row from the input stride.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        add             r8,  r5,  #13
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+        add             r8,  r5,  #15
+1:
+        sub             r9,  r9,  lr, lsl #1
+        bic             r8,  r8,  #7
+        sub             r4,  r4,  r8
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Set up the src pointers to include the left edge; for LR_HAVE_LEFT
+        // with left == NULL, the left pixels are read from the src buffer itself.
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // In this case we don't read the left 3 pixels from the src pointer,
+        // but adjust the stride as if we had done that.
+        add             r4,  r4,  #3
+
+1:      // Loop vertically
+        vld1.8          {q0}, [r3]!
+        vld1.8          {q4}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[]}, [r2]!
+        // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        vld1.32         {d11[]}, [r2]!
+        vext.8          q0,  q1,  q0,  #13
+        vext.8          q4,  q5,  q4,  #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+        // and shift q0 to have 3x the first byte at the front.
+        vdup.8          q1,  d0[0]
+        vdup.8          q5,  d8[0]
+        // Move r3/r12 back to account for the last 3 bytes we loaded before,
+        // which we'll shift out.
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        vext.8          q0,  q1,  q0,  #13
+        vext.8          q4,  q5,  q4,  #13
+
+2:
+        vmull.u8        q1,  d0,  d0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load the byte to pad with
+        // here, since it is easy to locate relative to the current pointers.
+        sub             lr,  r5, #(2 + 16 - 3 + 1)
+        ldrb            r11, [r3,  lr]
+        ldrb            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.8          q14, r11
+        vdup.8          q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge, we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can produce 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add5 w
+        vext.8          d16, d0,  d1,  #1
+        vext.8          d17, d0,  d1,  #2
+        vext.8          d18, d0,  d1,  #3
+        vext.8          d19, d0,  d1,  #4
+        vext.8          d20, d8,  d9,  #1
+        vext.8          d21, d8,  d9,  #2
+        vext.8          d22, d8,  d9,  #3
+        vext.8          d23, d8,  d9,  #4
+        vaddl.u8        q3,  d0,  d16
+        vaddl.u8        q12, d17, d18
+        vaddl.u8        q7,  d8,  d20
+        vaddl.u8        q13, d21, d22
+        vaddw.u8        q3,  q3,  d19
+        vaddw.u8        q7,  q7,  d23
+        vadd.u16        q3,  q3,  q12
+        vadd.u16        q7,  q7,  q13
+
+        vext.8          q8,  q1,  q2,  #2
+        vext.8          q9,  q1,  q2,  #4
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
+        vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, \w
+        vaddw_u16_n     q12, q13, d22, d23, \w
+        vadd_i32_n      q12, q13, q8,  q9, \w
+        vext.8          q8,  q5,  q6,  #2
+        vext.8          q9,  q5,  q6,  #4
+        vext.8          q10, q5,  q6,  #6
+        vext.8          q11, q5,  q6,  #8
+.if \w > 4
+        vaddl_u16_n     q1,  q5,  d10, d11, d16, d17, 8
+        vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, 8
+        vaddw_u16_n     q1,  q5,  d22, d23, 8
+        vadd.i32        q10, q1,  q8
+        vadd.i32        q11, q5,  q9
+.else
+        // Can't clobber q1/q5 if only doing 4 pixels
+        vaddl.u16       q8,  d10, d16
+        vaddl.u16       q9,  d18, d20
+        vaddw.u16       q8,  q8,  d22
+        vadd.i32        q10, q8,  q9
+.endif
+.endm
+        add5            8
+        vst1.16         {q3},       [r1,  :128]!
+        vst1.16         {q7},       [r11, :128]!
+        vst1.32         {q12, q13}, [r0,  :128]!
+        vst1.32         {q10, q11}, [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vld1.8          {d6},  [r3]!
+        vld1.8          {d14}, [r12]!
+        vmov            q1,  q2
+        vmov            q5,  q6
+        vext.8          q0,  q0,  q3,  #8
+        vext.8          q4,  q4,  q7,  #8
+        vmull.u8        q2,  d6,  d6
+        vmull.u8        q6,  d14, d14
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 7 <= w < 11
+        add5            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // w < 7, w+1 pixels valid in q0/q4
+        sub             lr,   r5,  #1
+        // lr = pixels valid - 2
+        adr             r11, L(box5_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box5_variable_shift_tbl):
+        .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0 right, shifting out invalid pixels,
+        // shift q0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #2
+        vext.8          q4,  q4,  q4,  #2
+        vext.8          q0,  q0,  q14, #14
+        vext.8          q4,  q4,  q15, #14
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #3
+        vext.8          q4,  q4,  q4,  #3
+        vext.8          q0,  q0,  q14, #13
+        vext.8          q4,  q4,  q15, #13
+        b               88f
+44:     // 4 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #5
+        vext.8          q4,  q4,  q4,  #5
+        vext.8          q0,  q0,  q14, #11
+        vext.8          q4,  q4,  q15, #11
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q0,  q0,  q0,  #6
+        vext.8          q4,  q4,  q4,  #6
+        vext.8          q0,  q0,  q14, #10
+        vext.8          q4,  q4,  q15, #10
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q0,  q0,  q0,  #7
+        vext.8          q4,  q4,  q4,  #7
+        vext.8          q0,  q0,  q14, #9
+        vext.8          q4,  q4,  q15, #9
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+        vmull.u8        q1,  d0,  d0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        add5            4
+        subs            r5,  r5,  #4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+        ble             9f
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q5,  q5,  q6,  #8
+        add5            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add5
+endfunc
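+
+// box5_h is the 5-tap analogue of box3_h above; per output element,
+// roughly:
+//
+//     sum[x]   = s[x] + s[x+1] + s[x+2] + s[x+3] + s[x+4];   // int16
+//     sumsq[x] = the same five taps, squared, summed as int32;
+//
+// again over two rows per iteration, with edge padding per the LR_HAVE_*
+// flags.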
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #2 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
+        add             lr,  lr,  #2
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q8-q13 and q0-q2 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q8,  q9},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q1},       [r6, :128], r8
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q10, q8
+        vmov            q11, q9
+        vmov            q1,  q0
+        vmov            q12, q8
+        vmov            q13, q9
+        vmov            q2,  q0
+
+3:
+        subs            r3,  r3,  #1
+.macro add3
+        vadd.i32        q8,  q8,  q10
+        vadd.i32        q9,  q9,  q11
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q8,  q8,  q12
+        vadd.i32        q9,  q9,  q13
+        vadd.i16        q0,  q0,  q2
+        vst1.32         {q8, q9}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        vmov            q10, q12
+        vmov            q11, q13
+        vmov            q1,  q2
+        ble             4f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3b
+
+4:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        add3
+
+5:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        pop             {r4-r9,pc}
+.purgem add3
+endfunc
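+
+// A scalar sketch of the vertical box3 pass above (illustrative indexing;
+// the row range and padding depend on LR_HAVE_TOP/LR_HAVE_BOTTOM):
+//
+//     for (int y = start; y < end; y++)
+//         for (int x = 0; x < w; x++) {
+//             sumsq_out[y][x] = sumsq[y - 1][x] + sumsq[y][x] + sumsq[y + 1][x];
+//             sum_out[y][x]   = sum[y - 1][x]   + sum[y][x]   + sum[y + 1][x];
+//         }
+//
+// It runs in place, 8 columns at a time, staging rows in registers so each
+// output row is written only after its input rows have been read.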
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q5-q7}
+        ldr             r4,  [sp, #76]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #8 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             0f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q6-q15 and q0-q3,q5 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q6,  q7},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vmov            q10, q6
+        vmov            q11, q7
+        vmov            q2,  q0
+        vmov            q12, q6
+        vmov            q13, q7
+        vmov            q3,  q0
+
+3:
+        cmp             r3,  #0
+        beq             4f
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+
+3:
+        // Start of vertical loop
+        subs            r3,  r3,  #2
+.macro add5
+        vadd.i32        q6,  q6,  q8
+        vadd.i32        q7,  q7,  q9
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q6,  q6,  q10
+        vadd.i32        q7,  q7,  q11
+        vadd.i16        q0,  q0,  q2
+        vadd.i32        q6,  q6,  q12
+        vadd.i32        q7,  q7,  q13
+        vadd.i16        q0,  q0,  q3
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q15
+        vadd.i16        q0,  q0,  q5
+        vst1.32         {q6, q7}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add5
+.macro shift2
+        vmov            q6,  q10
+        vmov            q7,  q11
+        vmov            q0,  q2
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q1,  q3
+        vmov            q10, q14
+        vmov            q11, q15
+        vmov            q2,  q5
+.endm
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        ble             5f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        add5
+        b               6f
+
+5:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             6f
+        // !LR_HAVE_BOTTOM
+        cmp             r3,  #0
+        bne             5f
+        // Three edge rows remain to produce; output the one at h-2 and
+        // the past-the-edge one at h.
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        // Pad the past-edge row from the last content row.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // r3 == -1; two rows left, output one.
+        // Pad the last two rows from the middle one.
+        vmov            q12, q10
+        vmov            q13, q11
+        vmov            q3,  q2
+        vmov            q14, q10
+        vmov            q15, q11
+        vmov            q5,  q2
+        add5
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        vpop            {q5-q7}
+        pop             {r4-r9,pc}
+.purgem add5
+endfunc
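+
+// box5_v sums five consecutive input rows per output row, but only writes
+// sums for every other row; the 5x5 finish pass further below derives the
+// rows in between from a single row of sums. That is why the output
+// pointers advance by two strides per iteration of the vertical loop.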
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        ldr             r4,  [sp, #76]
+        add             r3,  r3,  #2   // h += 2
+        vmov.i32        q15, #9        // n
+        movw            r5,  #455      // one_by_x for n = 9
+        mov             lr,  #SUM_STRIDE
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        ldr             r4,  [sp, #76]
+        add             r3,  r3,  #3   // h += 3
+        asr             r3,  r3,  #1   // h /= 2
+        vmov.i32        q15, #25       // n
+        mov             r5,  #164      // one_by_x for n = 25
+        mov             lr,  #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+        movrel          r12, X(sgr_x_by_x)
+        vld1.8          {q8, q9}, [r12, :128]!
+        vmov.i8         q11, #5
+        vmov.i8         d10, #55       // idx of last 5
+        vld1.8          {q10},    [r12, :128]
+        vmov.i8         d11, #72       // idx of last 4
+        vmov.i8         d12, #101      // idx of last 3
+        vmov.i8         d13, #169      // idx of last 2
+        vmov.i8         d14, #254      // idx of last 1
+        vmov.i8         d15, #32       // elements consumed in first vtbl
+        add             r2,  r2,  #2   // w += 2
+        add             r12, r2,  #7
+        bic             r12, r12, #7   // aligned w
+        sub             r12, lr,  r12  // increment between rows
+        vmov.i16        q13, #256
+        vdup.32         q12, r4
+        vdup.32         q14, r5        // one_by_x
+        sub             r0,  r0,  #(4*(SUM_STRIDE))
+        sub             r1,  r1,  #(2*(SUM_STRIDE))
+        mov             r4,  r2        // backup of w
+        vsub.i8         q8,  q8,  q11
+        vsub.i8         q9,  q9,  q11
+        vsub.i8         q10, q10, q11
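+        // dav1d_sgr_x_by_x[] ends in long runs of equal values (5, 4, 3,
+        // 2, 1, 0), so only its first 48 entries, biased by -5, are kept
+        // in q8-q10 for vtbl/vtbx; the vcgt/vadd sequence in the loop
+        // below rebuilds the values at higher indices from the run
+        // boundaries loaded into d10-d14.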
+1:
+        subs            r2,  r2,  #8
+        vld1.32         {q0, q1}, [r0, :128] // a
+        vld1.16         {q2},     [r1, :128] // b
+        vmul.i32        q0,  q0,  q15  // a * n
+        vmul.i32        q1,  q1,  q15  // a * n
+        vmull.u16       q3,  d4,  d4   // b * b
+        vmull.u16       q4,  d5,  d5   // b * b
+        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
+        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
+        vmul.i32        q0,  q0,  q12  // p * s
+        vmul.i32        q1,  q1,  q12  // p * s
+        vqshrn.u32      d0,  q0,  #16
+        vqshrn.u32      d1,  q1,  #16
+        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
+
+        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
+        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
+        vtbl.8          d1,  {q8, q9}, d0
+        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
+        vsub.i8         d9,  d0,  d15  // indices for vtbx
+        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
+        vadd.i8         d2,  d2,  d3
+        vtbx.8          d1,  {q10}, d9
+        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
+        vadd.i8         d6,  d6,  d7
+        vadd.i8         d8,  d8,  d22
+        vadd.i8         d2,  d2,  d6
+        vadd.i8         d1,  d1,  d8
+        vadd.i8         d1,  d1,  d2
+        vmovl.u8        q0,  d1        // x
+
+        vmull.u16       q1,  d0,  d4   // x * BB[i]
+        vmull.u16       q2,  d1,  d5   // x * BB[i]
+        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
+        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
+        vrshr.s32       q1,  q1,  #12  // AA[i]
+        vrshr.s32       q2,  q2,  #12  // AA[i]
+        vsub.i16        q0,  q13, q0   // 256 - x
+
+        vst1.32         {q1, q2}, [r0, :128]!
+        vst1.16         {q0},     [r1, :128]!
+        bgt             1b
+
+        subs            r3,  r3,  #1
+        ble             0f
+        add             r0,  r0,  r12, lsl #2
+        add             r1,  r1,  r12, lsl #1
+        mov             r2,  r4
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r5,pc}
+endfunc
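+
+// A scalar sketch of sgr_calc_ab, pieced together from the comments above
+// (a = sumsq, b = sum on input; n, s and one_by_x as set up by the two
+// entry points; the NEON code does the >> 20 as two saturating narrows):
+//
+//     int p = imax(a * n - b * b, 0);
+//     int z = imin((p * s + (1 << 19)) >> 20, 255);
+//     int x = dav1d_sgr_x_by_x[z];
+//     AA[i] = (x * b * one_by_x + (1 << 11)) >> 12;
+//     BB[i] = 256 - x;
+//
+// with AA written back over the sumsq buffer and BB over the sum buffer.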
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        sub             r7,  r3,  #(4*SUM_STRIDE)
+        add             r8,  r3,  #(4*SUM_STRIDE)
+        sub             r9,  r4,  #(2*SUM_STRIDE)
+        add             r10, r4,  #(2*SUM_STRIDE)
+        mov             r11, #SUM_STRIDE
+        mov             r12, #FILTER_OUT_STRIDE
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3 // Aligned width
+        sub             r2,  r2,  lr
+        sub             r12, r12, lr
+        sub             r11, r11, lr
+        sub             r11, r11, #4 // We read 4 extra elements from both a and b
+        mov             lr,  r5
+        vmov.i16        q14, #3
+        vmov.i32        q15, #3
+1:
+        vld1.16         {q0},       [r9]!
+        vld1.16         {q1},       [r4]!
+        vld1.16         {q2},       [r10]!
+        vld1.32         {q8,  q9},  [r7]!
+        vld1.32         {q10, q11}, [r3]!
+        vld1.32         {q12, q13}, [r8]!
+
+2:
+        subs            r5,  r5,  #4
+        vext.8          d6,  d0,  d1,  #2  // -stride
+        vext.8          d7,  d2,  d3,  #2  // 0
+        vext.8          d8,  d4,  d5,  #2  // +stride
+        vext.8          d9,  d0,  d1,  #4  // +1-stride
+        vext.8          d10, d2,  d3,  #4  // +1
+        vext.8          d11, d4,  d5,  #4  // +1+stride
+        vadd.i16        d2,  d2,  d6       // -1, -stride
+        vadd.i16        d7,  d7,  d8       // 0, +stride
+        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
+        vadd.i16        d2,  d2,  d7
+        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
+        vadd.i16        d2,  d2,  d10      // +1
+        vadd.i16        d0,  d0,  d4
+
+        vext.8          q3,  q8,  q9,  #4  // -stride
+        vshl.i16        d2,  d2,  #2
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q10, q11, #4  // 0
+        vext.8          q6,  q10, q11, #8  // +1
+        vmla.i16        d2,  d0,  d28      // * 3 -> a
+        vadd.i32        q3,  q3,  q10      // -stride, -1
+        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q6       // 0, +1
+        vadd.i32        q8,  q8,  q12      // -1+stride
+        vadd.i32        q3,  q3,  q5
+        vext.8          q7,  q12, q13, #4  // +stride
+        vext.8          q10, q12, q13, #8  // +1+stride
+        vld1.32         {d24[0]}, [r1]!    // src
+        vadd.i32        q3,  q3,  q7       // +stride
+        vadd.i32        q8,  q8,  q10      // +1+stride
+        vshl.i32        q3,  q3,  #2
+        vmla.i32        q3,  q8,  q15      // * 3 -> b
+        vmovl.u8        q12, d24           // src
+        vmov            d0,  d1
+        vmlal.u16       q3,  d2,  d24      // b + a * src
+        vmov            d2,  d3
+        vrshrn.i32      d6,  q3,  #9
+        vmov            d4,  d5
+        vst1.16         {d6}, [r0]!
+
+        ble             3f
+        vmov            q8,  q9
+        vmov            q10, q11
+        vmov            q12, q13
+        vld1.16         {d1},  [r9]!
+        vld1.16         {d3},  [r4]!
+        vld1.16         {d5},  [r10]!
+        vld1.32         {q9},  [r7]!
+        vld1.32         {q11}, [r3]!
+        vld1.32         {q13}, [r8]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r12, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r11, lsl #2
+        add             r7,  r7,  r11, lsl #2
+        add             r8,  r8,  r11, lsl #2
+        add             r4,  r4,  r11, lsl #1
+        add             r9,  r9,  r11, lsl #1
+        add             r10, r10, r11, lsl #1
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
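+
+// A scalar sketch of filter1's inner loop, following the "<< 2"/"* 3"
+// comments above (b16 = the int16 box sums, a32 = the int32 square sums):
+//
+//     int a = 4 * (b16[-stride] + b16[-1] + b16[0] + b16[1] + b16[stride])
+//           + 3 * (b16[-1 - stride] + b16[1 - stride] +
+//                  b16[-1 + stride] + b16[1 + stride]);
+//     int b = the same 4x/3x weighting applied to a32[];
+//     tmp[x] = (b + a * src[x] + (1 << 8)) >> 9;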
+
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
+function sgr_finish_filter2_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        add             r7,  r3,  #(4*(SUM_STRIDE))
+        sub             r3,  r3,  #(4*(SUM_STRIDE))
+        add             r8,  r4,  #(2*(SUM_STRIDE))
+        sub             r4,  r4,  #(2*(SUM_STRIDE))
+        mov             r9,  #(2*SUM_STRIDE)
+        mov             r10, #FILTER_OUT_STRIDE
+        add             r11, r5,  #7
+        bic             r11, r11, #7 // Aligned width
+        sub             r2,  r2,  r11
+        sub             r10, r10, r11
+        sub             r9,  r9,  r11
+        sub             r9,  r9,  #4 // We read 4 extra elements from a
+        sub             r12, r9,  #4 // We read 8 extra elements from b
+        mov             lr,  r5
+
+1:
+        vld1.16         {q0,  q1},  [r4]!
+        vld1.16         {q2,  q3},  [r8]!
+        vld1.32         {q8,  q9},  [r3]!
+        vld1.32         {q11, q12}, [r7]!
+        vld1.32         {q10},      [r3]!
+        vld1.32         {q13},      [r7]!
+
+2:
+        vmov.i16        q14, #5
+        vmov.i16        q15, #6
+        subs            r5,  r5,  #8
+        vext.8          q4,  q0,  q1,  #4  // +1-stride
+        vext.8          q5,  q2,  q3,  #4  // +1+stride
+        vext.8          q6,  q0,  q1,  #2  // -stride
+        vext.8          q7,  q2,  q3,  #2  // +stride
+        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
+        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
+        vadd.i16        q2,  q6,  q7       // -stride, +stride
+        vadd.i16        q0,  q0,  q5
+
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q9,  q10, #8
+        vext.8          q6,  q11, q12, #8  // +1+stride
+        vext.8          q7,  q12, q13, #8
+        vmul.i16        q0,  q0,  q14      // * 5
+        vmla.i16        q0,  q2,  q15      // * 6
+        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q9
+        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
+        vadd.i32        q7,  q7,  q12
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q7
+        vext.8          q6,  q8,  q9,  #4  // -stride
+        vext.8          q7,  q9,  q10, #4
+        vext.8          q8,  q11, q12, #4  // +stride
+        vext.8          q11, q12, q13, #4
+
+        vld1.8          {d4}, [r1]!
+
+        vmov.i32        q14, #5
+        vmov.i32        q15, #6
+
+        vadd.i32        q6,  q6,  q8       // -stride, +stride
+        vadd.i32        q7,  q7,  q11
+        vmul.i32        q4,  q4,  q14      // * 5
+        vmla.i32        q4,  q6,  q15      // * 6
+        vmul.i32        q5,  q5,  q14      // * 5
+        vmla.i32        q5,  q7,  q15      // * 6
+
+        vmovl.u8        q2,  d4
+        vmlal.u16       q4,  d0,  d4       // b + a * src
+        vmlal.u16       q5,  d1,  d5       // b + a * src
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #9
+        vrshrn.i32      d9,  q5,  #9
+        vmov            q2,  q3
+        vst1.16         {q4}, [r0]!
+
+        ble             3f
+        vmov            q8,  q10
+        vmov            q11, q13
+        vld1.16         {q1},       [r4]!
+        vld1.16         {q3},       [r8]!
+        vld1.32         {q9,  q10}, [r3]!
+        vld1.32         {q12, q13}, [r7]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r9,  lsl #2
+        add             r7,  r7,  r9,  lsl #2
+        add             r4,  r4,  r12, lsl #1
+        add             r8,  r8,  r12, lsl #1
+
+        vld1.32         {q8, q9}, [r3]!
+        vld1.16         {q0, q1}, [r4]!
+        vld1.32         {q10},    [r3]!
+
+        vmov.i16        q12, #5
+        vmov.i16        q13, #6
+
+4:
+        subs            r5,  r5,  #8
+        vext.8          q3,  q0,  q1,  #4  // +1
+        vext.8          q2,  q0,  q1,  #2  // 0
+        vadd.i16        q0,  q0,  q3       // -1, +1
+
+        vext.8          q4,  q8,  q9,  #4  // 0
+        vext.8          q5,  q9,  q10, #4
+        vext.8          q6,  q8,  q9,  #8  // +1
+        vext.8          q7,  q9,  q10, #8
+        vmul.i16        q2,  q2,  q13      // * 6
+        vmla.i16        q2,  q0,  q12      // * 5 -> a
+        vld1.8          {d22}, [r1]!
+        vadd.i32        q8,  q8,  q6       // -1, +1
+        vadd.i32        q9,  q9,  q7
+        vmovl.u8        q11, d22
+        vmul.i32        q4,  q4,  q15      // * 6
+        vmla.i32        q4,  q8,  q14      // * 5 -> b
+        vmul.i32        q5,  q5,  q15      // * 6
+        vmla.i32        q5,  q9,  q14      // * 5 -> b
+
+        vmlal.u16       q4,  d4,  d22      // b + a * src
+        vmlal.u16       q5,  d5,  d23
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #8
+        vrshrn.i32      d9,  q5,  #8
+        vmov            q8,  q10
+        vst1.16         {q4}, [r0]!
+
+        ble             5f
+        vld1.16         {q1},      [r4]!
+        vld1.32         {q9, q10}, [r3]!
+        b               4b
+
+5:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
+        sub             r4,  r4,  r11, lsl #1
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        sub             r3,  r3,  #16
+        sub             r4,  r4,  #16
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
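+
+// filter2 weights its taps 6x/5x and alternates between two row types,
+// since box5_v only produced sums on every other row:
+//
+//     // rows with sums above and below:
+//     a = 6 * (b16[-stride] + b16[stride])
+//       + 5 * (b16[-1 - stride] + b16[1 - stride] +
+//              b16[-1 + stride] + b16[1 + stride]);
+//     tmp[x] = (b + a * src[x] + (1 << 8)) >> 9;
+//     // rows with sums on the row itself:
+//     a = 6 * b16[0] + 5 * (b16[-1] + b16[1]);
+//     tmp[x] = (b + a * src[x] + (1 << 7)) >> 8;
+//
+// with b built from the int32 sums with the same weights in both cases.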
+
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int w, const int h,
+//                                    const int wt);
+function sgr_weighted1_8bpc_neon, export=1
+        push            {r4-r9,lr}
+        ldrd            r4,  r5,  [sp, #28]
+        ldrd            r6,  r7,  [sp, #36]
+        vdup.16         d31, r7
+        cmp             r6,  #2
+        add             r9,  r0,  r1
+        add             r12, r2,  r3
+        add             lr,  r4,  #2*FILTER_OUT_STRIDE
+        mov             r7,  #(4*FILTER_OUT_STRIDE)
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r8,  r5,  #7
+        bic             r8,  r8,  #7 // Aligned width
+        sub             r1,  r1,  r8
+        sub             r3,  r3,  r8
+        sub             r7,  r7,  r8, lsl #1
+        mov             r8,  r5
+        blt             2f
+1:
+        vld1.8          {d0},  [r2]!
+        vld1.8          {d16}, [r12]!
+        vld1.16         {q1},  [r4]!
+        vld1.16         {q9},  [lr]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vshll.u16       q10, d16, #7     // u << 7
+        vshll.u16       q11, d17, #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vmlal.s16       q10, d18, d31    // v
+        vmlal.s16       q11, d19, d31    // v
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vrshrn.i32      d20, q10, #11
+        vrshrn.i32      d21, q11, #11
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d20, q10
+        vst1.8          {d4},  [r0]!
+        vst1.8          {d20}, [r9]!
+        bgt             1b
+
+        sub             r6,  r6,  #2
+        cmp             r6,  #1
+        blt             0f
+        mov             r5,  r8
+        add             r0,  r0,  r1
+        add             r9,  r9,  r1
+        add             r2,  r2,  r3
+        add             r12, r12, r3
+        add             r4,  r4,  r7
+        add             lr,  lr,  r7
+        beq             2f
+        b               1b
+
+2:
+        vld1.8          {d0}, [r2]!
+        vld1.16         {q1}, [r4]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vqmovun.s16     d2,  q2
+        vst1.8          {d2}, [r0]!
+        bgt             2b
+0:
+        pop             {r4-r9,pc}
+endfunc
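+
+// A scalar sketch of the weighting above, following the u/v comments
+// (t1 is the filtered plane, wt the signed weight):
+//
+//     int u = src[x] << 4;
+//     int v = (u << 7) + wt * (t1[x] - u);
+//     dst[x] = iclip_u8((v + (1 << 10)) >> 11);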
+
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int16_t *t2,
+//                                    const int w, const int h,
+//                                    const int16_t wt[2]);
+function sgr_weighted2_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        ldr             r8,  [sp, #52]
+        cmp             r7,  #2
+        add             r10, r0,  r1
+        add             r11, r2,  r3
+        add             r12, r4,  #2*FILTER_OUT_STRIDE
+        add             lr,  r5,  #2*FILTER_OUT_STRIDE
+        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
+        mov             r8,  #4*FILTER_OUT_STRIDE
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r9,  r6,  #7
+        bic             r9,  r9,  #7 // Aligned width
+        sub             r1,  r1,  r9
+        sub             r3,  r3,  r9
+        sub             r8,  r8,  r9, lsl #1
+        mov             r9,  r6
+        blt             2f
+1:
+        vld1.8          {d0},  [r2]!
+        vld1.8          {d16}, [r11]!
+        vld1.16         {q1},  [r4]!
+        vld1.16         {q9},  [r12]!
+        vld1.16         {q2},  [r5]!
+        vld1.16         {q10}, [lr]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vsub.i16        q10, q10, q8     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vshll.u16       q11, d16, #7     // u << 7
+        vshll.u16       q8,  d17, #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
+        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vrshrn.i32      d22, q11, #11
+        vrshrn.i32      d23, q8,  #11
+        vqmovun.s16     d6,  q3
+        vqmovun.s16     d22, q11
+        vst1.8          {d6},  [r0]!
+        vst1.8          {d22}, [r10]!
+        bgt             1b
+
+        subs            r7,  r7,  #2
+        cmp             r7,  #1
+        blt             0f
+        mov             r6,  r9
+        add             r0,  r0,  r1
+        add             r10, r10, r1
+        add             r2,  r2,  r3
+        add             r11, r11, r3
+        add             r4,  r4,  r8
+        add             r12, r12, r8
+        add             r5,  r5,  r8
+        add             lr,  lr,  r8
+        beq             2f
+        b               1b
+
+2:
+        vld1.8          {d0}, [r2]!
+        vld1.16         {q1}, [r4]!
+        vld1.16         {q2}, [r5]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vqmovun.s16     d6,  q3
+        vst1.8          {d6}, [r0]!
+        bgt             2b
+0:
+        pop             {r4-r11,pc}
+endfunc
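+
+// sgr_weighted2 blends two filtered planes the same way:
+//
+//     int u = src[x] << 4;
+//     int v = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u);
+//     dst[x] = iclip_u8((v + (1 << 10)) >> 11);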
diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S
new file mode 100644 (file)
index 0000000..47631c0
--- /dev/null
@@ -0,0 +1,3351 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro avg dst0, dst1, t0, t1, t2, t3
+        vld1.16         {\t0,\t1},   [r2, :128]!
+        vld1.16         {\t2,\t3},   [r3, :128]!
+        vadd.i16        \t0,   \t0,  \t2
+        vadd.i16        \t1,   \t1,  \t3
+        vqrshrun.s16    \dst0, \t0,  #5
+        vqrshrun.s16    \dst1, \t1,  #5
+.endm
+
+.macro w_avg dst0, dst1, t0, t1, t2, t3
+        vld1.16         {\t0,\t1},   [r2, :128]!
+        vld1.16         {\t2,\t3},   [r3, :128]!
+        vsub.i16        \t0,   \t2,  \t0
+        vsub.i16        \t1,   \t3,  \t1
+        vqdmulh.s16     \t0,   \t0,  q15
+        vqdmulh.s16     \t1,   \t1,  q15
+        vadd.i16        \t0,   \t2,  \t0
+        vadd.i16        \t1,   \t3,  \t1
+        vqrshrun.s16    \dst0, \t0,  #4
+        vqrshrun.s16    \dst1, \t1,  #4
+.endm
+
+.macro mask dst0, dst1, t0, t1, t2, t3
+        vld1.8          {q14}, [lr, :128]!
+        vld1.16         {\t0,\t1},   [r2, :128]!
+        vmul.i8         q14,   q14,  q15
+        vld1.16         {\t2,\t3},   [r3, :128]!
+        vshll.i8        q13,   d28,  #8
+        vshll.i8        q14,   d29,  #8
+        vsub.i16        \t0,   \t2,  \t0
+        vsub.i16        \t1,   \t3,  \t1
+        vqdmulh.s16     \t0,   \t0,  q13
+        vqdmulh.s16     \t1,   \t1,  q14
+        vadd.i16        \t0,   \t2,  \t0
+        vadd.i16        \t1,   \t3,  \t1
+        vqrshrun.s16    \dst0, \t0,  #4
+        vqrshrun.s16    \dst1, \t1,  #4
+.endm
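+
+// Scalar equivalents of the three blend kernels above, for the 16-bit
+// intermediates produced by prep (8 bpc, 4 fractional bits); w is the
+// weight argument, m[] the mask:
+//
+//     avg:   dst[x] = iclip_u8((t1[x] + t2[x] + 16) >> 5);
+//     w_avg: dst[x] = iclip_u8((w * t1[x] + (16 - w) * t2[x] + 128) >> 8);
+//     mask:  dst[x] = iclip_u8((m[x] * t1[x] + (64 - m[x]) * t2[x] + 512) >> 10);
+//
+// w_avg and mask avoid the two full-width multiplies by applying vqdmulh
+// to the difference t2 - t1, at the cost of vqdmulh's truncation.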
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+        push            {r4-r6,lr}
+        ldr             r4, [sp, #16]
+        ldr             r5, [sp, #20]
+        clz             r4,  r4
+.ifnc \type, avg
+        ldr             lr, [sp, #24]
+.endif
+.ifc \type, w_avg
+        vdup.s16        q15, lr
+        vneg.s16        q15, q15
+        vshl.i16        q15, q15, #11
+.endif
+.ifc \type, mask
+        vmov.i8         q15, #256-2
+.endif
+        adr             r12, L(\type\()_tbl)
+        sub             r4,  r4,  #24
+        ldr             r4,  [r12, r4, lsl #2]
+        \type           d16, d17, q0,  q1,  q2,  q3
+        add             r12, r12, r4
+        bx              r12
+
+        .align 2
+L(\type\()_tbl):
+        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
+        .word 4f    - L(\type\()_tbl) + CONFIG_THUMB
+
+4:
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+        cmp             r5,  #4
+        vst1.32         {d16[0]},  [r0, :32], r1
+        vst1.32         {d16[1]},  [r6, :32], r1
+        vst1.32         {d17[0]},  [r0, :32], r1
+        vst1.32         {d17[1]},  [r6, :32], r1
+        beq             0f
+        \type           d18, d19,  q0,  q1,  q2,  q3
+        cmp             r5,  #8
+        vst1.32         {d18[0]},  [r0, :32], r1
+        vst1.32         {d18[1]},  [r6, :32], r1
+        vst1.32         {d19[0]},  [r0, :32], r1
+        vst1.32         {d19[1]},  [r6, :32], r1
+        beq             0f
+        \type           d16, d17, q0,  q1,  q2,  q3
+        vst1.32         {d16[0]},  [r0, :32], r1
+        vst1.32         {d16[1]},  [r6, :32], r1
+        \type           d18, d19,  q0,  q1,  q2,  q3
+        vst1.32         {d17[0]},  [r0, :32], r1
+        vst1.32         {d17[1]},  [r6, :32], r1
+        vst1.32         {d18[0]},  [r0, :32], r1
+        vst1.32         {d18[1]},  [r6, :32], r1
+        vst1.32         {d19[0]},  [r0, :32], r1
+        vst1.32         {d19[1]},  [r6, :32], r1
+        pop             {r4-r6,pc}
+80:
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+8:
+        vst1.8          {d16},  [r0, :64], r1
+        \type           d18, d19, q0,  q1,  q2,  q3
+        vst1.8          {d17},  [r6, :64], r1
+        vst1.8          {d18},  [r0, :64], r1
+        subs            r5,  r5,  #4
+        vst1.8          {d19},  [r6, :64], r1
+        ble             0f
+        \type           d16, d17, q0,  q1,  q2,  q3
+        b               8b
+160:
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+16:
+        \type           d18, d19, q0, q1, q2, q3
+        vst1.8          {q8},  [r0, :128], r1
+        \type           d20, d21, q0, q1, q2, q3
+        vst1.8          {q9},  [r6, :128], r1
+        \type           d22, d23, q0, q1, q2, q3
+        vst1.8          {q10}, [r0, :128], r1
+        subs            r5,  r5,  #4
+        vst1.8          {q11}, [r6, :128], r1
+        ble             0f
+        \type           d16, d17, q0, q1, q2, q3
+        b               16b
+320:
+        add             r6,  r0,  r1
+        lsl             r1,  r1,  #1
+32:
+        \type           d18, d19, q0, q1, q2, q3
+        \type           d20, d21, q0, q1, q2, q3
+        vst1.8          {q8,  q9},  [r0, :128], r1
+        \type           d22, d23, q0, q1, q2, q3
+        subs            r5,  r5,  #2
+        vst1.8          {q10, q11}, [r6, :128], r1
+        ble             0f
+        \type           d16, d17, q0, q1, q2, q3
+        b               32b
+640:
+        add             r6,  r0,  #32
+64:
+        \type           d18, d19, q0, q1, q2, q3
+        \type           d20, d21, q0, q1, q2, q3
+        \type           d22, d23, q0, q1, q2, q3
+        vst1.8          {q8,  q9},  [r0, :128], r1
+        \type           d16, d17, q0, q1, q2, q3
+        vst1.8          {q10, q11}, [r6, :128], r1
+        \type           d18, d19, q0, q1, q2, q3
+        \type           d20, d21, q0, q1, q2, q3
+        vst1.8          {q8,  q9},  [r0, :128], r1
+        \type           d22, d23, q0, q1, q2, q3
+        subs            r5,  r5,  #2
+        vst1.8          {q10, q11}, [r6, :128], r1
+        ble             0f
+        \type           d16, d17, q0, q1, q2, q3
+        b               64b
+1280:
+        sub             r1,  r1,  #32
+        add             r6,  r0,  #64
+128:
+        \type           d18, d19, q0, q1, q2, q3
+        \type           d20, d21, q0, q1, q2, q3
+        \type           d22, d23, q0, q1, q2, q3
+        vst1.8          {q8,  q9},  [r0, :128]!
+        \type           d16, d17, q0, q1, q2, q3
+        vst1.8          {q10, q11}, [r0, :128], r1
+        \type           d18, d19, q0, q1, q2, q3
+        \type           d20, d21, q0, q1, q2, q3
+        vst1.8          {q8,  q9},  [r6, :128]!
+        \type           d22, d23, q0, q1, q2, q3
+        subs            r5,  r5,  #1
+        vst1.8          {q10, q11}, [r6, :128], r1
+        ble             0f
+        \type           d16, d17, q0, q1, q2, q3
+        b               128b
+
+0:
+        pop             {r4-r6,pc}
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        ldr             r5,  [sp, #32]
+        ldr             r6,  [sp, #36]
+        ldr             r7,  [sp, #40]
+        clz             r8,  r4
+        adr             r9,  L(w_mask_\type\()_tbl)
+        sub             r8,  r8,  #24
+        ldr             r8,  [r9,  r8,  lsl #2]
+        add             r9,  r9,  r8
+        movw            r12, #6903
+        vdup.16         q14, r12
+.if \type == 444
+        vmov.i8         q15, #64
+.elseif \type == 422
+        vdup.8          d0,  r7         // d0[] <- sign
+        vmov.i8         d30, #129
+        vsub.i8         d30, d30, d0    // 129 - sign
+.elseif \type == 420
+        vdup.16         q0,  r7         // d0[] <- sign
+        vmov.i16        q15, #256
+        vsub.i16        q15, q15, q0    // 256 - sign
+.endif
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        bx              r9
+
+        .align 2
+L(w_mask_\type\()_tbl):
+        .word 1280f  - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+        .word 640f   - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+        .word 320f   - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+        .word 160f   - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+        .word 8f     - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+        .word 4f     - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1 (four rows at once)
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2 (four rows at once)
+        subs            r5,  r5,  #4
+        vsub.i16        q8,  q2,  q0    // tmp2-tmp1
+        vsub.i16        q9,  q3,  q1
+        vabd.s16        q10, q0,  q2    // abs(tmp1[x] - tmp2[x])
+        vabd.s16        q11, q1,  q3
+        vqsub.u16       q10, q14, q10   // 6903 - abs()
+        vqsub.u16       q11, q14, q11
+        vshr.s16        q10, q10, #8    // 64-m = (6903 - abs()) >> 8
+        vshr.s16        q11, q11, #8
+        vshl.s16        q12, q10, #9    // (64-m)<<9
+        vshl.s16        q13, q11, #9
+        vqdmulh.s16     q12, q12, q8    // ((tmp2-tmp1)*(64-m)<<9)>>15
+        vqdmulh.s16     q13, q13, q9
+        vadd.i16        q12, q12, q0    // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1
+        vadd.i16        q13, q13, q1
+        vqrshrun.s16    d24, q12, #4    // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4
+        vqrshrun.s16    d25, q13, #4
+.if \type == 444
+        vmovn.u16       d20, q10        // 64 - m
+        vmovn.u16       d21, q11
+        vsub.i8         q10, q15, q10   // m
+        vst1.8          {d20, d21}, [r6,  :128]!
+.elseif \type == 422
+        vpadd.s16       d20, d20, d21   // (64 - m) + (64 - n) (column wise addition)
+        vpadd.s16       d21, d22, d23
+        vmovn.s16       d6,  q10
+        vhsub.u8        d6,  d30, d6    // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        vst1.8          {d6},  [r6,  :64]!
+.elseif \type == 420
+        vadd.s16        d20, d20, d21   // (64 - my1) + (64 - my2) (row wise addition)
+        vadd.s16        d21, d22, d23
+        vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
+        vsub.s16        d20, d30, d20   // (256 - sign) - ((128 - m) + (128 - n))
+        vrshrn.u16      d20, q10,  #2   // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        vst1.32         {d20[0]}, [r6,  :32]!
+.endif
+        vst1.32         {d24[0]}, [r0,  :32], r1
+        vst1.32         {d24[1]}, [r12, :32], r1
+        vst1.32         {d25[0]}, [r0,  :32], r1
+        vst1.32         {d25[1]}, [r12, :32], r1
+        bgt             4b
+        pop             {r4-r9,pc}
+8:
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1y1, tmp1y2
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2y1, tmp2y2
+        subs            r5,  r5,  #2
+        vsub.i16        q8,  q2,  q0    // tmp2y1 - tmp1y1
+        vsub.i16        q9,  q3,  q1    // tmp2y2 - tmp1y2
+        vabd.s16        q10, q0,  q2    // abs(tmp1y1 - tmp2y1)
+        vabd.s16        q11, q1,  q3    // abs(tmp1y2 - tmp2y2)
+        vqsub.u16       q10, q14, q10   // 6903 - abs(tmp1y1 - tmp2y1)
+        vqsub.u16       q11, q14, q11   // 6903 - abs(tmp1y2 - tmp2y2)
+        vshr.s16        q10, q10, #8    // 64 - my1 = (6903 - abs(tmp1y1 - tmp2y1)) >> 8
+        vshr.s16        q11, q11, #8    // 64 - my2 = (6903 - abs(tmp1y2 - tmp2y2)) >> 8
+        vshl.s16        q12, q10, #9    // (64 - my1) << 9
+        vshl.s16        q13, q11, #9    // (64 - my2) << 9
+        vqdmulh.s16     q12, q12, q8    // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+        vqdmulh.s16     q13, q13, q9    // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+        vadd.s16        q12, q12, q0    // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+        vadd.s16        q13, q13, q1    // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+        vqrshrun.s16    d24, q12, #4    // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+        vqrshrun.s16    d25, q13, #4    // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+.if \type == 444
+        vmovn.u16       d20, q10        // 64 - m
+        vmovn.u16       d21, q11
+        vsub.i8         q10, q15, q10   // m
+        vst1.8          {d20, d21}, [r6,  :128]!
+.elseif \type == 422
+        vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
+        vpadd.s16       d21, d22, d23   // (64 - my2) + (64 - ny2)
+        vmovn.s16       d20, q10
+        vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
+        vst1.8          {d20}, [r6,  :64]!
+.elseif \type == 420
+        vadd.s16        q10, q10, q11   // (64 - my1) + (64 - my2) (row wise addition)
+        vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
+        vsub.s16        d20, d30, d20   // (256 - sign) - ((128 - m) + (128 - n))
+        vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        vst1.32         {d20[0]}, [r6,  :32]!
+.endif
+        vst1.16         {d24}, [r0,  :64], r1
+        vst1.16         {d25}, [r12, :64], r1
+        bgt             8b
+        pop             {r4-r9,pc}
+1280:
+640:
+320:
+160:
+        sub             r1,  r1,  r4
+.if \type == 444
+        add             lr,  r6,  r4
+.elseif \type == 422
+        add             lr,  r6,  r4,  lsr #1
+.endif
+        add             r9,  r3,  r4,  lsl #1
+        add             r7,  r2,  r4,  lsl #1
+161:
+        mov             r8,  r4
+16:
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1y1
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2y1
+        vld1.16         {d16, d17, d18, d19}, [r7,  :128]! // tmp1y2
+        subs            r8,  r8,  #16
+        vsub.i16        q2,  q2,  q0    // tmp2y1 - tmp1y1
+        vsub.i16        q3,  q3,  q1
+        vabs.s16        q10, q2         // abs(tmp2y1 - tmp1y1)
+        vabs.s16        q11, q3
+        vqsub.u16       q10, q14, q10   // 6903 - abs(tmp1y1 - tmp2y1)
+        vqsub.u16       q11, q14, q11
+        vshr.s16        q10, q10, #8    // 64 - my1 = (6903 - abs(tmp1y1 - tmp2y1)) >> 8
+        vshr.s16        q11, q11, #8
+        vshl.s16        q12, q10, #9    // (64 - my1) << 9
+        vshl.s16        q13, q11, #9
+        vqdmulh.s16     q12, q12, q2    // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+        vqdmulh.s16     q13, q13, q3
+        vadd.i16        q12, q12, q0    // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+        vadd.i16        q13, q13, q1
+        vld1.16         {d0,  d1,  d2,  d3},  [r9,  :128]! // tmp2y2
+.if \type == 444
+        vmovn.u16       d20, q10        // 64 - my1
+        vmovn.u16       d21, q11
+        vsub.i8         q10, q15, q10   // my1
+        vst1.8          {d20, d21}, [r6,  :128]!
+.elseif \type == 422
+        vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
+        vpadd.s16       d21, d22, d23
+        vmovn.s16       d20, q10
+        vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
+        vst1.8          {d20}, [r6,  :64]!
+.endif
+        vqrshrun.s16    d24, q12, #4    // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+        vqrshrun.s16    d25, q13, #4
+        vsub.i16        q0,  q0,  q8    // tmp2y2 - tmp1y2
+        vsub.i16        q1,  q1,  q9
+        vst1.16         {d24, d25}, [r0,  :128]!    // store dsty1
+        vabs.s16        q2,  q0         // abs(tmp2y2 - tmp1y2)
+        vabs.s16        q3,  q1
+        vqsub.u16       q2,  q14, q2    // 6903 - abs(tmp2y2 - tmp1y2)
+        vqsub.u16       q3,  q14, q3
+        vshr.s16        q2,  q2,  #8    // 64 - my2 = (6903 - abs(tmp2y2 - tmp1y2)) >> 8
+        vshr.s16        q3,  q3,  #8
+        vshl.s16        q12, q2,  #9    // (64 - my2) << 9
+        vshl.s16        q13, q3,  #9
+.if \type == 444
+        vmovn.u16       d4,  q2         // 64 - my2
+        vmovn.u16       d5,  q3
+        vsub.i8         q2,  q15, q2    // my2
+        vst1.8          {d4,  d5},  [lr,  :128]!
+.elseif \type == 422
+        vpadd.s16       d4,  d4,  d5    // (64 - my2) + (64 - ny2) (column wise addition)
+        vpadd.s16       d5,  d6,  d7
+        vmovn.s16       d4,  q2
+        vhsub.u8        d4,  d30, d4    // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
+        vst1.8          {d4},  [lr,  :64]!
+.elseif \type == 420
+        vadd.s16        q10, q10, q2    // (64 - my1) + (64 - my2) (row wise addition)
+        vadd.s16        q11, q11, q3
+        vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
+        vpadd.s16       d21, d22, d23
+        vsub.s16        q10, q15, q10   // (256 - sign) - ((128 - m) + (128 - n))
+        vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        vst1.8          {d20}, [r6,  :64]!
+.endif
+        vqdmulh.s16     q12, q12, q0    // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+        vqdmulh.s16     q13, q13, q1
+        vadd.i16        q12, q12, q8    // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+        vadd.i16        q13, q13, q9
+        vqrshrun.s16    d24, q12, #4    // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+        vqrshrun.s16    d25, q13, #4
+        vst1.16         {d24, d25}, [r12, :128]!   // store dsty2
+        bgt             16b
+        subs            r5,  r5,  #2
+        add             r2,  r2,  r4,  lsl #1
+        add             r3,  r3,  r4,  lsl #1
+        add             r7,  r7,  r4,  lsl #1
+        add             r9,  r9,  r4,  lsl #1
+.if \type == 444
+        add             r6,  r6,  r4
+        add             lr,  lr,  r4
+.elseif \type == 422
+        add             r6,  r6,  r4,  lsr #1
+        add             lr,  lr,  r4,  lsr #1
+.endif
+        add             r0,  r0,  r1
+        add             r12, r12, r1
+        bgt             161b
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
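+
+// Rough C model of the per-pixel computation implemented above (a sketch
+// assuming the 8 bpc intermediate format, not the exact reference code):
+//   int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
+//   dst[x] = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
+// and, per mask layout:
+//   444: mask[x] = m
+//   422: mask[x] = (m0 + m1 + 1 - sign) >> 1
+//   420: mask[x] = (m0 + m1 + m2 + m3 + 2 - sign) >> 2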
+
+
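+// Argument order reconstructed from the loads below (a hedged sketch,
+// not taken from a header): blend(dst, dst_stride, tmp, w, h, mask), with
+//   dst[x] = (tmp[x] * mask[x] + dst[x] * (64 - mask[x]) + 32) >> 6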
+function blend_8bpc_neon, export=1
+        push            {r4-r5,lr}
+        ldr             r4,  [sp, #12]
+        ldr             r5,  [sp, #16]
+        clz             lr,  r3
+        adr             r3,  L(blend_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r3, lr, lsl #2]
+        add             r3,  r3,  lr
+        bx              r3
+
+        .align 2
+L(blend_tbl):
+        .word 320f  - L(blend_tbl) + CONFIG_THUMB
+        .word 160f  - L(blend_tbl) + CONFIG_THUMB
+        .word 80f   - L(blend_tbl) + CONFIG_THUMB
+        .word 40f   - L(blend_tbl) + CONFIG_THUMB
+
+40:
+        vmov.i8         d22, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+4:
+        vld1.u8         {d2},     [r5,  :64]!
+        vld1.u8         {d1},     [r2,  :64]!
+        vld1.32         {d0[]},   [r0,  :32]
+        subs            r4,  r4,  #2
+        vld1.32         {d0[1]},  [r12, :32]
+        vsub.i8         d3,  d22, d2
+        vmull.u8        q8,  d1,  d2
+        vmlal.u8        q8,  d0,  d3
+        vrshrn.i16      d20, q8,  #6
+        vst1.32         {d20[0]}, [r0,  :32], r1
+        vst1.32         {d20[1]}, [r12, :32], r1
+        bgt             4b
+        pop             {r4-r5,pc}
+80:
+        vmov.i8         d16, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+8:
+        vld1.u8         {q1},  [r5,  :128]!
+        vld1.u8         {q2},  [r2,  :128]!
+        vld1.u8         {d0},  [r0,  :64]
+        vsub.i8         d17, d16, d2
+        vld1.u8         {d1},  [r12, :64]
+        subs            r4,  r4,  #2
+        vsub.i8         d18, d16, d3
+        vmull.u8        q3,  d2,  d4
+        vmlal.u8        q3,  d0,  d17
+        vmull.u8        q10, d3,  d5
+        vmlal.u8        q10, d1,  d18
+        vrshrn.i16      d22, q3,  #6
+        vrshrn.i16      d23, q10, #6
+        vst1.u8         {d22}, [r0,  :64], r1
+        vst1.u8         {d23}, [r12, :64], r1
+        bgt             8b
+        pop             {r4-r5,pc}
+160:
+        vmov.i8         q12, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+16:
+        vld1.u8         {q1,  q2},  [r5,  :128]!
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0},  [r0,  :128]
+        subs            r4,  r4,  #2
+        vsub.i8         q15, q12, q1
+        vld1.u8         {q13}, [r12, :128]
+        vmull.u8        q3,  d16, d2
+        vmlal.u8        q3,  d0,  d30
+        vmull.u8        q14, d17, d3
+        vmlal.u8        q14, d1,  d31
+        vsub.i8         q15, q12, q2
+        vrshrn.i16      d20, q3,  #6
+        vrshrn.i16      d21, q14, #6
+        vmull.u8        q3,  d18, d4
+        vmlal.u8        q3,  d26, d30
+        vmull.u8        q14, d19, d5
+        vmlal.u8        q14, d27, d31
+        vrshrn.i16      d22, q3,  #6
+        vrshrn.i16      d23, q14, #6
+        vst1.u8         {q10}, [r0,  :128], r1
+        vst1.u8         {q11}, [r12, :128], r1
+        bgt             16b
+        pop             {r4-r5,pc}
+320:
+        vmov.i8         q10, #64
+32:
+        vld1.u8         {q2,  q3},  [r5,  :128]!
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0,  q1},  [r0,  :128]
+        subs            r4,  r4,  #1
+        vsub.i8         q11, q10, q2
+        vmull.u8        q15, d16, d4
+        vmlal.u8        q15, d0,  d22
+        vmull.u8        q14, d17, d5
+        vmlal.u8        q14, d1,  d23
+        vsub.i8         q11, q10, q3
+        vrshrn.i16      d24, q15, #6
+        vrshrn.i16      d25, q14, #6
+        vmull.u8        q15, d18, d6
+        vmlal.u8        q15, d2,  d22
+        vmull.u8        q14, d19, d7
+        vmlal.u8        q14, d3,  d23
+        vrshrn.i16      d26, q15, #6
+        vrshrn.i16      d27, q14, #6
+        vst1.u8         {q12, q13}, [r0,  :128],  r1
+        bgt             32b
+        pop             {r4-r5,pc}
+endfunc
+
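+// blend_h (hedged reconstruction): same dst/stride/tmp/w/h argument
+// layout as blend above, but the per-row weights come from obmc_masks[h..]
+// and only the top 3*h/4 rows are blended; r4 is reduced accordingly.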
+function blend_h_8bpc_neon, export=1
+        push            {r4-r8,lr}
+        ldr             r4,  [sp, #24]
+        movrel          r5,  X(obmc_masks)
+        add             r5,  r5,  r4
+        sub             r4,  r4,  r4,  lsr #2
+        clz             r6,  r3
+        adr             r7,  L(blend_h_tbl)
+        sub             r6,  r6,  #24
+        ldr             r6,  [r7, r6, lsl #2]
+        add             r7,  r7,  r6
+        bx              r7
+
+        .align 2
+L(blend_h_tbl):
+        .word 1280f  - L(blend_h_tbl) + CONFIG_THUMB
+        .word 640f   - L(blend_h_tbl) + CONFIG_THUMB
+        .word 320f   - L(blend_h_tbl) + CONFIG_THUMB
+        .word 160f   - L(blend_h_tbl) + CONFIG_THUMB
+        .word 80f    - L(blend_h_tbl) + CONFIG_THUMB
+        .word 40f    - L(blend_h_tbl) + CONFIG_THUMB
+        .word 20f    - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+        vmov.i8         d22, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+2:
+        vld1.16         {d2[], d3[]},  [r5,  :16]!
+        vld1.32         {d1[0]},  [r2,  :32]!
+        subs            r4,  r4,  #2
+        vld1.16         {d0[]},   [r0,  :16]
+        vzip.8          d2,  d3
+        vsub.i8         d4,  d22, d2
+        vld1.16         {d0[1]},  [r12, :16]
+        vmull.u8        q8,  d1,  d2
+        vmlal.u8        q8,  d0,  d4
+        vrshrn.i16      d20, q8,  #6
+        vst1.16         {d20[0]}, [r0,  :16], r1
+        vst1.16         {d20[1]}, [r12, :16], r1
+        bgt             2b
+        pop             {r4-r8,pc}
+40:
+        vmov.i8         d22, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+4:
+        vld2.u8         {d2[],  d3[]},   [r5,  :16]!
+        vld1.u8         {d1},     [r2,  :64]!
+        subs            r4,  r4,  #2
+        vext.u8         d2,  d2,  d3,   #4
+        vld1.32         {d0[]},   [r0,  :32]
+        vsub.i8         d6,  d22, d2
+        vld1.32         {d0[1]},  [r12, :32]
+        vmull.u8        q8,  d1,  d2
+        vmlal.u8        q8,  d0,  d6
+        vrshrn.i16      d20, q8,  #6
+        vst1.32         {d20[0]}, [r0,  :32], r1
+        vst1.32         {d20[1]}, [r12, :32], r1
+        bgt             4b
+        pop             {r4-r8,pc}
+80:
+        vmov.i8         q8, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+8:
+        vld2.u8         {d2[],  d3[]},  [r5,  :16]!
+        vld1.u8         {d4,  d5},  [r2,  :128]!
+        vld1.u8         {d0},   [r0,  :64]
+        vsub.i8         q9,  q8,  q1
+        vld1.u8         {d1},   [r12, :64]
+        subs            r4,  r4,  #2
+        vmull.u8        q3,  d2,  d4
+        vmlal.u8        q3,  d0,  d18
+        vmull.u8        q10, d3,  d5
+        vmlal.u8        q10, d1,  d19
+        vrshrn.i16      d22, q3,  #6
+        vrshrn.i16      d23, q10, #6
+        vst1.u8         {d22}, [r0,  :64], r1
+        vst1.u8         {d23}, [r12, :64], r1
+        bgt             8b
+        pop             {r4-r8,pc}
+160:
+        vmov.i8         q12, #64
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+16:
+        vld2.u8         {d28[], d29[]}, [r5,  :16]!
+        vld1.u8         {d2,  d3,  d4,  d5},  [r2,  :128]!
+        vsub.i8         q15, q12, q14
+        vld1.u8         {q0},  [r0,  :128]
+        subs            r4,  r4,  #2
+        vld1.u8         {q13}, [r12, :128]
+        vmull.u8        q3,  d2,  d28
+        vmlal.u8        q3,  d0,  d30
+        vmull.u8        q8,  d3,  d28
+        vmlal.u8        q8,  d1,  d30
+        vrshrn.i16      d18, q3,  #6
+        vrshrn.i16      d19, q8,  #6
+        vmull.u8        q3,  d4,  d29
+        vmlal.u8        q3,  d26, d31
+        vmull.u8        q8,  d5,  d29
+        vmlal.u8        q8,  d27, d31
+        vrshrn.i16      d20, q3,  #6
+        vrshrn.i16      d21, q8,  #6
+        vst1.u8         {q9},  [r0,  :128], r1
+        vst1.u8         {q10}, [r12, :128], r1
+        bgt             16b
+        pop             {r4-r8,pc}
+320:
+640:
+1280:
+        vmov.i8         d20, #64
+        sub             r1,  r1,  r3
+321:
+        vld1.u8         {d6[]},  [r5]!
+        vsub.i8         d7,  d20, d6
+        mov             r8,  r3
+32:
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0,  q1},  [r0,  :128]
+        vmull.u8        q15, d16, d6
+        vmlal.u8        q15, d0,  d7
+        vmull.u8        q14, d17, d6
+        vmlal.u8        q14, d1,  d7
+        vrshrn.i16      d0,  q15, #6
+        vrshrn.i16      d1,  q14, #6
+        vmull.u8        q15, d18, d6
+        vmlal.u8        q15, d2,  d7
+        vmull.u8        q14, d19, d6
+        vmlal.u8        q14, d3,  d7
+        vrshrn.i16      d2,  q15, #6
+        vrshrn.i16      d3,  q14, #6
+        vst1.u8         {q0,  q1},  [r0,  :128]!
+        subs            r8,  r8,  #32
+        bgt             32b
+        add             r0,  r0,  r1
+        subs            r4,  r4,  #1
+        bgt             321b
+        pop             {r4-r8,pc}
+endfunc
+
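+// blend_v (hedged reconstruction): per-column weights come from
+// obmc_masks[w..], and only the left 3*w/4 columns of each row are
+// written, hence the partial stores in the loops below.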
+function blend_v_8bpc_neon, export=1
+        push            {r4-r5,lr}
+        ldr             r4,  [sp, #12]
+        movrel          r5,  X(obmc_masks)
+        add             r5,  r5,  r3
+        clz             lr,  r3
+        adr             r3,  L(blend_v_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r3, lr, lsl #2]
+        add             r3,  r3,  lr
+        bx              r3
+
+        .align 2
+L(blend_v_tbl):
+        .word 320f  - L(blend_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(blend_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(blend_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(blend_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+        vmov.i8         d22, #64
+        vld1.8          {d2[]},   [r5]
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        vsub.i8         d3,  d22, d2
+2:
+        vld1.16         {d1[0]},  [r2,  :16]!
+        vld1.8          {d0[]},   [r0]
+        subs            r4,  r4,  #2
+        vld1.8          {d1[1]},  [r2]
+        vld1.8          {d0[1]},  [r12]
+        vmull.u8        q2,  d1,  d2
+        vmlal.u8        q2,  d0,  d3
+        vrshrn.i16      d6,  q2,  #6
+        add             r2,  r2,  #2
+        vst1.8          {d6[0]},  [r0],  r1
+        vst1.8          {d6[1]},  [r12], r1
+        bgt             2b
+        pop             {r4-r5,pc}
+40:
+        vmov.i8         d22, #64
+        vld1.32         {d4[]},   [r5,  :32]
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        vsub.i8         d5,  d22, d4
+        sub             r1,  r1,  #2
+4:
+        vld1.u8         {d2},     [r2,  :64]!
+        vld1.32         {d0[]},   [r0,  :32]
+        vld1.32         {d0[1]},  [r12, :32]
+        subs            r4,  r4,  #2
+        vmull.u8        q3,  d2,  d4
+        vmlal.u8        q3,  d0,  d5
+        vrshrn.i16      d20, q3,  #6
+        vst1.16         {d20[0]}, [r0,  :16]!
+        vst1.16         {d20[2]}, [r12, :16]!
+        vst1.8          {d20[2]}, [r0],  r1
+        vst1.8          {d20[6]}, [r12], r1
+        bgt             4b
+        pop             {r4-r5,pc}
+80:
+        vmov.i8         d16, #64
+        vld1.u8         {d2},  [r5,  :64]
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        vsub.i8         d17, d16, d2
+        sub             r1,  r1,  #4
+8:
+        vld1.u8         {d4,  d5},  [r2,  :128]!
+        vld1.u8         {d0},  [r0,  :64]
+        vld1.u8         {d1},  [r12, :64]
+        subs            r4,  r4,  #2
+        vmull.u8        q3,  d2,  d4
+        vmlal.u8        q3,  d0,  d17
+        vmull.u8        q10, d2,  d5
+        vmlal.u8        q10, d1,  d17
+        vrshrn.i16      d22, q3,  #6
+        vrshrn.i16      d23, q10, #6
+        vst1.32         {d22[0]}, [r0,  :32]!
+        vst1.32         {d23[0]}, [r12, :32]!
+        vst1.16         {d22[2]}, [r0,  :16], r1
+        vst1.16         {d23[2]}, [r12, :16], r1
+        bgt             8b
+        pop             {r4-r5,pc}
+160:
+        vmov.i8         q12, #64
+        vld1.u8         {q14}, [r5,  :128]
+        add             r12, r0,  r1
+        lsl             r1,  r1,  #1
+        vsub.i8         q11, q12, q14
+        sub             r1,  r1,  #8
+16:
+        vld1.u8         {q1,  q2},  [r2,  :128]!
+        vld1.u8         {q0},  [r0,  :128]
+        subs            r4,  r4,  #2
+        vld1.u8         {q13}, [r12, :128]
+        vmull.u8        q3,  d2,  d28
+        vmlal.u8        q3,  d0,  d22
+        vmull.u8        q8,  d3,  d29
+        vmlal.u8        q8,  d1,  d23
+        vrshrn.i16      d18, q3,  #6
+        vrshrn.i16      d19, q8,  #6
+        vmull.u8        q3,  d4,  d28
+        vmlal.u8        q3,  d26, d22
+        vmull.u8        q8,  d5,  d29
+        vmlal.u8        q8,  d27, d23
+        vrshrn.i16      d20, q3,  #6
+        vrshrn.i16      d21, q8,  #6
+        vst1.u8         {d18},    [r0,  :64]!
+        vst1.u8         {d20},    [r12, :64]!
+        vst1.32         {d19[0]}, [r0,  :32], r1
+        vst1.32         {d21[0]}, [r12, :32], r1
+        bgt             16b
+        pop             {r4-r5,pc}
+320:
+        vmov.i8         q10, #64
+        vld1.u8         {q2,  q3},  [r5,  :128]
+        vsub.i8         q11, q10, q2
+        vsub.i8         d24, d20, d6
+32:
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {d0,  d1,  d2},  [r0,  :64]
+        subs            r4,  r4,  #1
+        vmull.u8        q15, d16, d4
+        vmlal.u8        q15, d0,  d22
+        vmull.u8        q14, d17, d5
+        vmlal.u8        q14, d1,  d23
+        vrshrn.i16      d0,  q15, #6
+        vrshrn.i16      d1,  q14, #6
+        vmull.u8        q15, d18, d6
+        vmlal.u8        q15, d2,  d24
+        vrshrn.i16      d2,  q15, #6
+        vst1.u8         {d0,  d1,  d2},  [r0,  :64],  r1
+        bgt             32b
+        pop             {r4-r5,pc}
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r5,
+// and assumes that r8 is set to (clz(w)-24).
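+// It is only reached with mx == my == 0, i.e. it is the plain-copy path;
+// (clz(w)-24) indexes the width table below from 128 down to 2.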
+function put_neon
+        adr             r9,  L(put_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(put_tbl):
+        .word 1280f - L(put_tbl) + CONFIG_THUMB
+        .word 640f  - L(put_tbl) + CONFIG_THUMB
+        .word 32f   - L(put_tbl) + CONFIG_THUMB
+        .word 160f  - L(put_tbl) + CONFIG_THUMB
+        .word 8f    - L(put_tbl) + CONFIG_THUMB
+        .word 4f    - L(put_tbl) + CONFIG_THUMB
+        .word 2f    - L(put_tbl) + CONFIG_THUMB
+
+2:
+        vld1.16         {d0[]}, [r2], r3
+        vld1.16         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.16         {d0[0]}, [r0, :16], r1
+        vst1.16         {d1[0]}, [r0, :16], r1
+        bgt             2b
+        pop             {r4-r11,pc}
+4:
+        vld1.32         {d0[]}, [r2], r3
+        vld1.32         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        bgt             4b
+        pop             {r4-r11,pc}
+8:
+        vld1.8          {d0}, [r2], r3
+        vld1.8          {d1}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.8          {d0}, [r0, :64], r1
+        vst1.8          {d1}, [r0, :64], r1
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+        add             r8,  r0,  r1
+        lsl             r1,  r1,  #1
+        add             r9,  r2,  r3
+        lsl             r3,  r3,  #1
+16:
+        vld1.8          {q0}, [r2], r3
+        vld1.8          {q1}, [r9], r3
+        subs            r5,  r5,  #2
+        vst1.8          {q0}, [r0, :128], r1
+        vst1.8          {q1}, [r8, :128], r1
+        bgt             16b
+        pop             {r4-r11,pc}
+32:
+        vld1.8          {q0,  q1},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q0,  q1},  [r0, :128], r1
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r1,  r1,  #32
+        sub             r3,  r3,  #32
+64:
+        vld1.8          {q0,  q1},  [r2]!
+        vst1.8          {q0,  q1},  [r0, :128]!
+        vld1.8          {q2,  q3},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q2,  q3},  [r0, :128], r1
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r1,  r1,  #96
+        sub             r3,  r3,  #96
+128:
+        vld1.8          {q8,  q9},  [r2]!
+        vst1.8          {q8,  q9},  [r0, :128]!
+        vld1.8          {q10, q11}, [r2]!
+        vst1.8          {q10, q11}, [r0, :128]!
+        vld1.8          {q12, q13}, [r2]!
+        vst1.8          {q12, q13}, [r0, :128]!
+        vld1.8          {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q14, q15}, [r0, :128], r1
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// assumes that the caller has loaded the h argument into r4,
+// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
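+// It is likewise only reached with mx == my == 0: the no-filter prep path
+// just widens pixels into the intermediate format, tmp[x] = src[x] << 4.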
+function prep_neon
+        adr             r9,  L(prep_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(prep_tbl):
+        .word 1280f - L(prep_tbl) + CONFIG_THUMB
+        .word 640f  - L(prep_tbl) + CONFIG_THUMB
+        .word 320f  - L(prep_tbl) + CONFIG_THUMB
+        .word 160f  - L(prep_tbl) + CONFIG_THUMB
+        .word 8f    - L(prep_tbl) + CONFIG_THUMB
+        .word 4f    - L(prep_tbl) + CONFIG_THUMB
+
+4:
+        vld1.32         {d0[]}, [r1], r2
+        vld1.32         {d2[]}, [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d0,  #4
+        vshll.u8        q1,  d2,  #4
+        vst1.16         {d1, d2}, [r0, :64]!
+        bgt             4b
+        pop             {r4-r11,pc}
+8:
+        vld1.8          {d0}, [r1], r2
+        vld1.8          {d2}, [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d0,  #4
+        vshll.u8        q1,  d2,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+        add             r9,  r1,  r2
+        lsl             r2,  r2,  #1
+        add             r8,  r0,  r7
+        lsl             r7,  r7,  #1
+16:
+        vld1.8          {q2}, [r1], r2
+        vld1.8          {q3}, [r9], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d4,  #4
+        vshll.u8        q1,  d5,  #4
+        vshll.u8        q2,  d6,  #4
+        vshll.u8        q3,  d7,  #4
+        vst1.16         {q0, q1}, [r0, :128], r7
+        vst1.16         {q2, q3}, [r8, :128], r7
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        add             r8,  r0,  r3
+32:
+        vld1.8          {q0,  q1},  [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q8,  d0,  #4
+        vshll.u8        q9,  d1,  #4
+        vld1.8          {q2,  q3},  [r1], r2
+        vshll.u8        q10, d2,  #4
+        vshll.u8        q11, d3,  #4
+        vshll.u8        q12, d4,  #4
+        vst1.16         {q8,  q9},  [r0, :128], r7
+        vshll.u8        q13, d5,  #4
+        vst1.16         {q10, q11}, [r8, :128], r7
+        vshll.u8        q14, d6,  #4
+        vst1.16         {q12, q13}, [r0, :128], r7
+        vshll.u8        q15, d7,  #4
+        vst1.16         {q14, q15}, [r8, :128], r7
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r2,  r2,  #32
+        add             r8,  r0,  #32
+        mov             r6,  #64
+64:
+        vld1.8          {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshll.u8        q8,  d0,  #4
+        vshll.u8        q9,  d1,  #4
+        vld1.8          {q2,  q3},  [r1], r2
+        vshll.u8        q10, d2,  #4
+        vshll.u8        q11, d3,  #4
+        vshll.u8        q12, d4,  #4
+        vst1.16         {q8,  q9},  [r0, :128], r6
+        vshll.u8        q13, d5,  #4
+        vshll.u8        q14, d6,  #4
+        vst1.16         {q10, q11}, [r8, :128], r6
+        vshll.u8        q15, d7,  #4
+        vst1.16         {q12, q13}, [r0, :128], r6
+        vst1.16         {q14, q15}, [r8, :128], r6
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r2,  r2,  #96
+        add             r8,  r0,  #32
+        mov             r6,  #64
+128:
+        vld1.8          {q0,  q1},  [r1]!
+        vld1.8          {q2,  q3},  [r1]!
+        vshll.u8        q10, d0,  #4
+        vshll.u8        q11, d1,  #4
+        vshll.u8        q12, d2,  #4
+        vshll.u8        q13, d3,  #4
+        vshll.u8        q14, d4,  #4
+        vshll.u8        q15, d5,  #4
+        vld1.8          {q8,  q9},  [r1]!
+        vst1.16         {q10, q11}, [r0, :128], r6
+        vst1.16         {q12, q13}, [r8, :128], r6
+        vshll.u8        q0,  d6,  #4
+        vshll.u8        q1,  d7,  #4
+        vshll.u8        q2,  d16, #4
+        vshll.u8        q3,  d17, #4
+        vshll.u8        q8,  d18, #4
+        vshll.u8        q9,  d19, #4
+        vld1.8          {q10, q11}, [r1], r2
+        vst1.16         {q14, q15}, [r0, :128], r6
+        vst1.16         {q0,  q1},  [r8, :128], r6
+        vshll.u8        q12, d20, #4
+        vshll.u8        q13, d21, #4
+        vshll.u8        q14, d22, #4
+        vshll.u8        q15, d23, #4
+        subs            r4,  r4,  #1
+        vst1.16         {q2,  q3},  [r0, :128], r6
+        vst1.16         {q8,  q9},  [r8, :128], r6
+        vst1.16         {q12, q13}, [r0, :128], r6
+        vst1.16         {q14, q15}, [r8, :128], r6
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+
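+// Helper macros for the 8tap filters below. The load macros fetch up to
+// seven rows, alternating between two pointers that start one row apart
+// and each advance by \strd (two rows per step), either as whole
+// registers or as replicated 16/32-bit lanes for the narrow widths.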
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        vld1.\wd        {\d0[]}, [\s0], \strd
+        vld1.\wd        {\d1[]}, [\s1], \strd
+.ifnb \d2
+        vld1.\wd        {\d2[]}, [\s0], \strd
+        vld1.\wd        {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.\wd        {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.\wd        {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.\wd        {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        vld1.8          {\d0}, [\s0], \strd
+        vld1.8          {\d1}, [\s1], \strd
+.ifnb \d2
+        vld1.8          {\d2}, [\s0], \strd
+        vld1.8          {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.8          {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.8          {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.8          {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1_16 r0, r1, r2, r3, r4
+        vext.8          \r0, \r0, \r1, #6
+        vext.8          \r1, \r1, \r2, #6
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #6
+        vext.8          \r3, \r3, \r4, #6
+.endif
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+        vext.8          \r0, \r0, \r1, #4
+        vext.8          \r1, \r1, \r2, #4
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #4
+        vext.8          \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
+        vmovl.u8        \q0, \d0
+        vmovl.u8        \q1, \d1
+.ifnb \q2
+        vmovl.u8        \q2, \d2
+        vmovl.u8        \q3, \d3
+.endif
+.ifnb \q4
+        vmovl.u8        \q4, \d4
+.endif
+.ifnb \q5
+        vmovl.u8        \q5, \d5
+.endif
+.ifnb \q6
+        vmovl.u8        \q6, \d6
+.endif
+.endm
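+// mul_mla_4 applies a 4-tap filter to one register of rows; the
+// mul_mla_8_1/2/4 variants compute two 8-tap output rows at once, with
+// the second row's source registers offset by 1, 2 or 4 rows respectively.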
+.macro mul_mla_4 d, s0, s1, s2, s3
+        vmul.s16        \d,  \s0,  d0[0]
+        vmla.s16        \d,  \s1,  d0[1]
+        vmla.s16        \d,  \s2,  d0[2]
+        vmla.s16        \d,  \s3,  d0[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+        vmul.s16        \d0, \s0, d0[0]
+        vmla.s16        \d0, \s1, d0[1]
+        vmla.s16        \d0, \s2, d0[2]
+        vmla.s16        \d0, \s3, d0[3]
+        vmla.s16        \d0, \s4, d1[0]
+        vmla.s16        \d0, \s5, d1[1]
+        vmla.s16        \d0, \s6, d1[2]
+        vmla.s16        \d0, \s7, d1[3]
+        vmul.s16        \d1, \s1, d0[0]
+        vmla.s16        \d1, \s2, d0[1]
+        vmla.s16        \d1, \s3, d0[2]
+        vmla.s16        \d1, \s4, d0[3]
+        vmla.s16        \d1, \s5, d1[0]
+        vmla.s16        \d1, \s6, d1[1]
+        vmla.s16        \d1, \s7, d1[2]
+        vmla.s16        \d1, \s8, d1[3]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+        vmul.s16        \d0, \s0, d0[0]
+        vmla.s16        \d0, \s1, d0[1]
+        vmla.s16        \d0, \s2, d0[2]
+        vmla.s16        \d0, \s3, d0[3]
+        vmla.s16        \d0, \s4, d1[0]
+        vmla.s16        \d0, \s5, d1[1]
+        vmla.s16        \d0, \s6, d1[2]
+        vmla.s16        \d0, \s7, d1[3]
+        vmul.s16        \d1, \s2, d0[0]
+        vmla.s16        \d1, \s3, d0[1]
+        vmla.s16        \d1, \s4, d0[2]
+        vmla.s16        \d1, \s5, d0[3]
+        vmla.s16        \d1, \s6, d1[0]
+        vmla.s16        \d1, \s7, d1[1]
+        vmla.s16        \d1, \s8, d1[2]
+        vmla.s16        \d1, \s9, d1[3]
+.endm
+.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
+        vmul.s16        \d0, \s0,  d0[0]
+        vmla.s16        \d0, \s1,  d0[1]
+        vmla.s16        \d0, \s2,  d0[2]
+        vmla.s16        \d0, \s3,  d0[3]
+        vmla.s16        \d0, \s4,  d1[0]
+        vmla.s16        \d0, \s5,  d1[1]
+        vmla.s16        \d0, \s6,  d1[2]
+        vmla.s16        \d0, \s7,  d1[3]
+        vmul.s16        \d1, \s4,  d0[0]
+        vmla.s16        \d1, \s5,  d0[1]
+        vmla.s16        \d1, \s6,  d0[2]
+        vmla.s16        \d1, \s7,  d0[3]
+        vmla.s16        \d1, \s8,  d1[0]
+        vmla.s16        \d1, \s9,  d1[1]
+        vmla.s16        \d1, \s10, d1[2]
+        vmla.s16        \d1, \s11, d1[3]
+.endm
+.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
+        vqrshrun.s16    \d0, \q0, #\shift
+.ifnb \q1
+        vqrshrun.s16    \d1, \q1, #\shift
+.endif
+.ifnb \q2
+        vqrshrun.s16    \d2, \q2, #\shift
+        vqrshrun.s16    \d3, \q3, #\shift
+.endif
+.endm
+.macro vrshr_s16 shift, r0, r1, r2, r3
+        vrshr.s16       \r0, \r0, #\shift
+.ifnb \r1
+        vrshr.s16       \r1, \r1, #\shift
+.endif
+.ifnb \r2
+        vrshr.s16       \r2, \r2, #\shift
+        vrshr.s16       \r3, \r3, #\shift
+.endif
+.endm
+.macro st_16 strd, reg, lanes
+        vst1.16         {\reg[0]}, [r0, :16], \strd
+        vst1.16         {\reg[1]}, [r8, :16], \strd
+.if \lanes > 2
+        vst1.16         {\reg[2]}, [r0, :16], \strd
+        vst1.16         {\reg[3]}, [r8, :16], \strd
+.endif
+.endm
+.macro st_32 strd, r0, r1
+        vst1.32         {\r0[0]}, [r0, :32], \strd
+        vst1.32         {\r0[1]}, [r8, :32], \strd
+.ifnb \r1
+        vst1.32         {\r1[0]}, [r0, :32], \strd
+        vst1.32         {\r1[1]}, [r8, :32], \strd
+.endif
+.endm
+.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+        vst1.8          {\r0}, [r0, \align], \strd
+        vst1.8          {\r1}, [r8, \align], \strd
+.ifnb \r2
+        vst1.8          {\r2}, [r0, \align], \strd
+        vst1.8          {\r3}, [r8, \align], \strd
+.endif
+.ifnb \r4
+        vst1.8          {\r4}, [r0, \align], \strd
+        vst1.8          {\r5}, [r8, \align], \strd
+        vst1.8          {\r6}, [r0, \align], \strd
+        vst1.8          {\r7}, [r8, \align], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
+.ifc \type, put
+        vqrshrun_s16    6,     \q0, \d0, \q1, \d2
+        st_32           \strd, \d0, \d2
+.else
+        vrshr_s16       2,          \q0, \q1
+        st_reg          \strd, :64, \d0, \d1, \d2, \d3
+.endif
+.endm
+.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
+.ifc \type, put
+        vqrshrun_s16    6,          \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        st_reg          \strd, :64, \d0, \d1, \d2, \d3
+.else
+        vrshr_s16       2,          \q0, \q1, \q2, \q3
+        st_reg          \strd, :128,\q0, \q1, \q2, \q3
+.endif
+.endm
+.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
+.ifc \type, put
+        vqrshrun.s16    \d0,   \q0, #6
+        vqrshrun.s16    \d1,   \q1, #6
+        vqrshrun.s16    \d4,   \q2, #6
+        vqrshrun.s16    \d5,   \q3, #6
+        st_reg          \strd, :128, \q0, \q2
+.else
+        vrshr_s16       2,     \q0, \q1, \q2, \q3
+        vst1.16         {\q0, \q1}, [r0, :128], \strd
+        vst1.16         {\q2, \q3}, [r8, :128], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        movw            r8,  \type_h
+        movw            r9,  \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
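+// Each constant packs two mc_subpel_filters row bases: bits 7-13 hold the
+// 8-tap base used for w > 4 (0, 15 or 30) and bits 0-6 the 4-tap base
+// used for w <= 4 (3*15 or 4*15). Multiplying mx/my by 0x4081 below
+// replicates the subpel offset into the same bit fields, so a single add
+// selects the filter row, while bits 14 and up remain a plain copy of the
+// offset for the subpel-filtering tests.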
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+function \type\()_8tap_neon
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        movw            r10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx,  \mx, r10
+        mul             \my,  \my, r10
+        add             \mx,  \mx, r8 // mx, 8tap_h, 4tap_h
+        add             \my,  \my, r9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+
+        clz             r8,  \w
+        tst             \mx, #(0x7f << 14)
+        sub             r8,  r8,  #24
+        movrel          r10, X(mc_subpel_filters), -8
+        bne             L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        bne             L(\type\()_8tap_v)
+        b               \type\()_neon
+
+L(\type\()_8tap_h):
+        cmp             \w,  #4
+        ubfx            r9,  \mx, #7, #7
+        and             \mx, \mx, #0x7f
+        it              gt
+        movgt           \mx,  r9
+        tst             \my,  #(0x7f << 14)
+        add             \mx, r10, \mx, lsl #3
+        bne             L(\type\()_8tap_hv)
+
+        adr             r9,  L(\type\()_8tap_h_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_8tap_h_tbl):
+        .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
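+        // A hedged note on the rounding: the subpel filter taps sum to 64,
+        // so the horizontal pass rounds by 2 to produce the 4-bit-shifted
+        // 8 bpc intermediate, and the put paths narrow with a further
+        // rounding shift by 4 back to pixel range.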
+20:     // 2xN h
+.ifc \type, put
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+2:
+        vld1.8          {d4},  [\src], \s_strd
+        vld1.8          {d6},  [\sr2], \s_strd
+        vmovl.u8        q2,  d4
+        vmovl.u8        q3,  d6
+        vext.8          d5,  d4,  d5,  #2
+        vext.8          d7,  d6,  d7,  #2
+        subs            \h,  \h,  #2
+        vtrn.32         d4,  d6
+        vtrn.32         d5,  d7
+        vmul.s16        d2,  d4,  d0[0]
+        vmla.s16        d2,  d5,  d0[1]
+        vmla.s16        d2,  d6,  d0[2]
+        vmla.s16        d2,  d7,  d0[3]
+        vrshr.s16       d2,  d2,  #2
+        vqrshrun.s16    d2,  q1,  #4
+        vst1.16         {d2[0]}, [\dst, :16], \d_strd
+        vst1.16         {d2[1]}, [\ds2, :16], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+4:
+        vld1.8          {d16}, [\src], \s_strd
+        vld1.8          {d24}, [\sr2], \s_strd
+        vmovl.u8        q8,  d16
+        vmovl.u8        q12, d24
+        vext.8          q9,  q8,  q8,  #2
+        vext.8          q10, q8,  q8,  #4
+        vext.8          q11, q8,  q8,  #6
+        vext.8          q13, q12, q12, #2
+        vext.8          q14, q12, q12, #4
+        vext.8          q15, q12, q12, #6
+        subs            \h,  \h,  #2
+        vmul.s16        d4,  d16, d0[0]
+        vmla.s16        d4,  d18, d0[1]
+        vmla.s16        d4,  d20, d0[2]
+        vmla.s16        d4,  d22, d0[3]
+        vmul.s16        d5,  d24, d0[0]
+        vmla.s16        d5,  d26, d0[1]
+        vmla.s16        d5,  d28, d0[2]
+        vmla.s16        d5,  d30, d0[3]
+        vrshr.s16       q2,  q2,  #2
+.ifc \type, put
+        vqrshrun.s16    d4,  q2,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+.endif
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:     // 8xN h
+        vld1.8          {d0}, [\mx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+8:
+        vld1.8          {q8},  [\src], \s_strd
+        vld1.8          {q12}, [\sr2], \s_strd
+        vmovl.u8        q9,  d17
+        vmovl.u8        q8,  d16
+        vmovl.u8        q13, d25
+        vmovl.u8        q12, d24
+
+        vmul.s16        q10, q8,  d0[0]
+        vmul.s16        q14, q12, d0[0]
+.irpc i, 1234567
+        vext.8          q11, q8,  q9,  #(2*\i)
+        vext.8          q15, q12, q13, #(2*\i)
+.if \i < 4
+        vmla.s16        q10, q11, d0[\i]
+        vmla.s16        q14, q15, d0[\i]
+.else
+        vmla.s16        q10, q11, d1[\i-4]
+        vmla.s16        q14, q15, d1[\i-4]
+.endif
+.endr
+        subs            \h,  \h,  #2
+        vrshr.s16       q10, q10, #2
+        vrshr.s16       q14, q14, #2
+.ifc \type, put
+        vqrshrun.s16    d20, q10, #4
+        vqrshrun.s16    d28, q14, #4
+        vst1.8          {d20}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q10}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        bgt             8b
+        pop             {r4-r11,pc}
+
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        // This could be done without touching q4-q6, by using only
+        // one temporary for vext in the loop. That's slower on A7 and A53
+        // (but surprisingly, marginally faster on A8 and A73).
+        vpush           {q4-q6}
+        vld1.8          {d0}, [\mx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        sub             \s_strd,  \s_strd,  \w
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w
+.endif
+161:
+        vld1.8          {d16, d17, d18},  [\src]!
+        vld1.8          {d24, d25, d26},  [\sr2]!
+        mov             \mx, \w
+        vmovl.u8        q10, d18
+        vmovl.u8        q9,  d17
+        vmovl.u8        q8,  d16
+        vmovl.u8        q14, d26
+        vmovl.u8        q13, d25
+        vmovl.u8        q12, d24
+
+16:
+        vmul.s16        q1,  q8,  d0[0]
+        vmul.s16        q2,  q9,  d0[0]
+        vmul.s16        q3,  q12, d0[0]
+        vmul.s16        q4,  q13, d0[0]
+.irpc i, 1234567
+        vext.8          q5,  q8,  q9,  #(2*\i)
+        vext.8          q6,  q9,  q10, #(2*\i)
+        vext.8          q11, q12, q13, #(2*\i)
+        vext.8          q15, q13, q14, #(2*\i)
+.if \i < 4
+        vmla.s16        q1,  q5,  d0[\i]
+        vmla.s16        q2,  q6,  d0[\i]
+        vmla.s16        q3,  q11, d0[\i]
+        vmla.s16        q4,  q15, d0[\i]
+.else
+        vmla.s16        q1,  q5,  d1[\i-4]
+        vmla.s16        q2,  q6,  d1[\i-4]
+        vmla.s16        q3,  q11, d1[\i-4]
+        vmla.s16        q4,  q15, d1[\i-4]
+.endif
+.endr
+        vrshr.s16       q1,  q1,  #2
+        vrshr.s16       q2,  q2,  #2
+        vrshr.s16       q3,  q3,  #2
+        vrshr.s16       q4,  q4,  #2
+        subs            \mx, \mx, #16
+.ifc \type, put
+        vqrshrun.s16    d2,  q1,  #4
+        vqrshrun.s16    d3,  q2,  #4
+        vqrshrun.s16    d4,  q3,  #4
+        vqrshrun.s16    d5,  q4,  #4
+        vst1.8          {q1}, [\dst, :128]!
+        vst1.8          {q2}, [\ds2, :128]!
+.else
+        vst1.16         {q1, q2}, [\dst, :128]!
+        vst1.16         {q3, q4}, [\ds2, :128]!
+.endif
+        ble             9f
+
+        vmov            q8,  q10
+        vmov            q12, q14
+        vld1.8          {d18, d19}, [\src]!
+        vld1.8          {d26, d27}, [\sr2]!
+        vmovl.u8        q10, d19
+        vmovl.u8        q9,  d18
+        vmovl.u8        q14, d27
+        vmovl.u8        q13, d26
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             161b
+        vpop            {q4-q6}
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            r9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r9
+        add             \my, r10, \my, lsl #3
+
+        adr             r9,  L(\type\()_8tap_v_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_8tap_v_tbl):
+        .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
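+        // For the narrow vertical cases below, two consecutive rows share
+        // each register: interleave_1_16/interleave_1_32 shift the rows
+        // together so one multiply-accumulate chain filters both at once.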
+20:     // 2xN v
+.ifc \type, put
+        bgt             28f
+
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vmovl.s8        q0,  d0
+
+        // 2x2 v
+        load_16         \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        interleave_1_16 d1, d2, d3, d4, d5
+        bgt             24f
+        vmovl_u8        q8, d1, q9, d2, q10, d3, q11, d4
+        mul_mla_4       d6, d16, d18, d20, d22
+        vqrshrun_s16    6,   q3,  d6
+        st_16           \d_strd, d6, 2
+        pop             {r4-r11,pc}
+
+24:     // 2x4 v
+        load_16         \sr2, \src, \s_strd, d6, d7
+        interleave_1_16 d5, d6, d7
+        vmovl_u8        q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
+        vmov            d17, d20
+        vmov            d19, d22
+        vmov            d21, d24
+        vmov            d23, d26
+        mul_mla_4       q3, q8, q9, q10, q11
+        vqrshrun_s16    6,   q3,  d6
+        st_16           \d_strd, d6, 4
+        pop             {r4-r11,pc}
+
+28:     // 2x8, 2x16 v
+        vpush           {q4-q7}
+        vld1.8          {d0}, [\my]
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        load_16         \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d10, d12, d14
+        interleave_1_16 d2,  d4,  d6,  d8,  d10
+        interleave_1_16 d10, d12, d14
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q5,  d10, q6,  d12
+        vmov            d3,  d6
+        vmov            d5,  d8
+        vmov            d7,  d10
+        vmov            d9,  d12
+216:
+        subs            \h,  \h,  #8
+        load_16         \sr2, \src, \s_strd, d16, d18, d20, d22
+        load_16         \sr2, \src, \s_strd, d24, d26, d28, d30
+        interleave_1_16 d14, d16, d18, d20, d22
+        interleave_1_16 d22, d24, d26, d28, d30
+        vmovl_u8        q7,  d14, q8,  d16, q9,  d18, q10, d20
+        vmovl_u8        q11, d22, q12, d24, q13, d26, q14, d28
+        vmov            d11, d14
+        vmov            d13, d16
+        vmov            d15, d18
+        vmov            d17, d20
+        vmov            d19, d22
+        vmov            d21, d24
+        vmov            d23, d26
+        vmov            d25, d28
+        mul_mla_8_4     q1,  q2,  q1,  q2,  q3,  q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12
+        vqrshrun_s16    6,   q1,  d2,  q2,  d4
+        st_16           \d_strd, d2, 4
+        st_16           \d_strd, d4, 4
+        ble             0f
+        vmov            q1,  q9
+        vmov            q2,  q10
+        vmov            q3,  q11
+        vmov            q4,  q12
+        vmov            q5,  q13
+        vmov            q6,  q14
+        vmov            d14, d30
+        b               216b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.endif
+
+40:
+        bgt            480f
+
+        // 4x2, 4x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_32         \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        interleave_1_32 d1,  d2,  d3,  d4,  d5
+        vmovl_u8        q8,  d1,  q9,  d2,  q10, d3,  q11, d4
+        mul_mla_4       q3,  q8,  q9,  q10, q11
+        shift_store_4   \type, \d_strd, q3, d6, d7
+        ble             0f
+        load_32         \sr2, \src, \s_strd, d6, d7
+        interleave_1_32 d5,  d6,  d7
+        vmovl_u8        q12, d5,  q13, d6
+        mul_mla_4       q3,  q10, q11, q12, q13
+        shift_store_4   \type, \d_strd, q3, d6, d7
+0:
+        pop             {r4-r11,pc}
+
+480:    // 4x8, 4x16 v
+        vpush           {q4}
+        vld1.8          {d0}, [\my]
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_32         \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d16, d18, d20
+        interleave_1_32 d2,  d4,  d6
+        interleave_1_32 d6,  d8,  d16, d18, d20
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q8,  d16, q9,  d18
+
+48:
+        subs            \h,  \h,  #4
+        load_32         \sr2, \src, \s_strd, d22, d24, d26, d28
+        interleave_1_32 d20, d22, d24, d26, d28
+        vmovl_u8        q10, d20, q11, d22, q12, d24, q13, d26
+        mul_mla_8_2     q1,  q2,  q1,  q2,  q3,  q4,  q8,  q9,  q10, q11, q12, q13
+        shift_store_4   \type, \d_strd, q1,  d2,  d3,  q2,  d4,  d5
+        ble             0f
+        subs            \h,  \h,  #4
+        load_32         \sr2,  \src, \s_strd, d30, d2,  d4,  d6
+        interleave_1_32 d28, d30, d2,  d4,  d6
+        vmovl_u8        q14, d28, q15, d30, q1,  d2,  q2,  d4
+        mul_mla_8_2     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1,  q2
+        shift_store_4   \type, \d_strd, q8,  d16, d17, q9,  d18, d19
+        ble             0f
+        subs            \h,  \h,  #4
+        load_32         \sr2, \src, \s_strd, d8,  d16, d18, d20
+        interleave_1_32 d6,  d8,  d16, d18, d20
+        vmovl_u8        q3,  d6,  q4,  d8,  q8,  d16, q9, d18
+        mul_mla_8_2     q12, q13, q12, q13, q14, q15, q1,  q2,  q3,  q4,  q8,  q9
+        shift_store_4   \type, \d_strd, q12, d24, d25, q13, d26, d27
+        bgt             48b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+
+80:
+        bgt             880f
+
+        // 8x2, 8x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_reg        \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        vmovl_u8        q8,  d1,  q9,  d2,  q10, d3,  q11, d4,  q12, d5
+        mul_mla_4       q1,  q8,  q9,  q10, q11
+        mul_mla_4       q2,  q9,  q10, q11, q12
+        shift_store_8   \type, \d_strd, q1, d2, q2, d4
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, d6, d7
+        vmovl_u8        q13, d6,  q14, d7
+        mul_mla_4       q1,  q10, q11, q12, q13
+        mul_mla_4       q2,  q11, q12, q13, q14
+        shift_store_8   \type, \d_strd, q1, d2, q2, d4
+0:
+        pop             {r4-r11,pc}
+
+880:    // 8x6, 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        vpush           {q4}
+        vld1.8          {d0}, [\my]
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        mov             \my, \h
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_reg        \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d16, d18, d20
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q8,  d16, q9,  d18, q10, d20
+
+88:
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d22, d24
+        vmovl_u8        q11, d22, q12, d24
+        mul_mla_8_1     q1,  q2,  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12
+        shift_store_8   \type, \d_strd, q1,  d2,  q2,  d4
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d26, d28
+        vmovl_u8        q13, d26, q14, d28
+        mul_mla_8_1     q3,  q4,  q3,  q4,  q8,  q9,  q10, q11, q12, q13, q14
+        shift_store_8   \type, \d_strd, q3,  d6,  q4,  d8
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d30, d2
+        vmovl_u8        q15, d30, q1,  d2
+        mul_mla_8_1     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1
+        shift_store_8   \type, \d_strd, q8,  d16, q9,  d18
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d4,  d6
+        vmovl_u8        q2,  d4,  q3,  d6
+        mul_mla_8_1     q10, q11, q10, q11, q12, q13, q14, q15, q1,  q2,  q3
+        shift_store_8   \type, \d_strd, q10, d20, q11, d22
+        ble             9f
+        subs            \h,  \h,  #4
+        load_reg        \sr2, \src, \s_strd, d8,  d16, d18, d20
+        vmovl_u8        q4,  d8,  q8,  d16, q9,  d18, q10, d20
+        mul_mla_8_1     q12, q13, q12, q13, q14, q15, q1,  q2,  q3,  q4,  q8
+        mul_mla_8_1     q14, q15, q14, q15, q1,  q2,  q3,  q4,  q8,  q9,  q10
+        shift_store_8   \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
+        bgt             88b
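+        // Width loop epilogue: rewind to the top of the next 8-pixel column.
+        // The doubled strides are halved again, mls rewinds src/dst by the
+        // h rows just processed, src additionally backs up past the rows of
+        // vertical filter history, h is restored, and both pointers advance
+        // one 8-pixel column (16 bytes for prep's 16-bit destination).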
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        add             \src, \src, #8
+.ifc \type, put
+        add             \dst, \dst, #8
+.else
+        add             \dst, \dst, #16
+.endif
+        b               168b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+
+160:
+        bgt             1680b
+
+        // 16x2, 16x4 v
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        cmp             \h,  #2
+        load_reg        \src, \sr2, \s_strd, q11, q12, q13, q14, q15
+        vmovl.u8        q1,  d22
+        vmovl.u8        q2,  d24
+        vmovl.u8        q3,  d26
+        vmovl.u8        q8,  d28
+        vmovl.u8        q9,  d30
+        vmovl.u8        q11, d23
+        vmovl.u8        q12, d25
+        vmovl.u8        q13, d27
+        vmovl.u8        q14, d29
+        vmovl.u8        q15, d31
+        mul_mla_4       q1,  q1,  q2,  q3,  q8
+        mul_mla_4       q10, q2,  q3,  q8,  q9
+        mul_mla_4       q2,  q11, q12, q13, q14
+        mul_mla_4       q11, q12, q13, q14, q15
+        shift_store_16  \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, q10, q11
+        vmovl.u8        q1,  d20
+        vmovl.u8        q10, d21
+        vmovl.u8        q12, d22
+        vmovl.u8        q11, d23
+        mul_mla_4       q2,  q3,  q8,  q9,  q1
+        mul_mla_4       q3,  q13, q14, q15, q10
+        mul_mla_4       q13, q8,  q9,  q1,  q12
+        mul_mla_4       q14, q14, q15, q10, q11
+        shift_store_16  \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            r9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r9
+        add             \my,  r10, \my, lsl #3
+
+        adr             r9,  L(\type\()_8tap_hv_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
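+        // As with the other dispatch tables in this file, each entry is the
+        // offset from the table base to the handler for one block width;
+        // CONFIG_THUMB (presumably 1 when assembled as Thumb) keeps the low
+        // bit of the computed target set so that the "bx r9" above
+        // interworks correctly.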
+        .align 2
+L(\type\()_8tap_hv_tbl):
+        .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]},  [\mx]
+        bgt             280f
+        add             \my,  \my,  #2
+        vld1.32         {d2[]},  [\my]
+
+        // 2x2, 2x4 hv
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+
+        vld1.8          {d26}, [\src], \s_strd
+        vmovl.u8        q13, d26
+        vext.8          q14, q13, q13, #2
+        vmul.s16        d26, d26, d0
+        vmul.s16        d28, d28, d0
+        vpadd.s16       d26, d26, d28
+        vpadd.s16       d26, d26, d26
+        vrshr.s16       d16, d26, #2
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d16, d16, d16, #4
+        vmov            d17, d26
+        vext.8          d16, d16, d26, #4
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d18, d17, d26, #4
+        vmov            d19, d26
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqmovun.s16     d4,  q2
+        subs            \h,  \h,  #2
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        vld1.8          {d2},  [\my]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.8          {d26}, [\src], \s_strd
+        vmovl.u8        q13, d26
+        vext.8          q14, q13, q13, #2
+        vmul.s16        d26, d26, d0
+        vmul.s16        d28, d28, d0
+        vpadd.s16       d26, d26, d28
+        vpadd.s16       d26, d26, d26
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d16, d16, d16, #4
+        vmov            d17, d26
+        vext.8          d16, d16, d26, #4
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d18, d17, d26, #4
+        vmov            d19, d26
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d20, d19, d26, #4
+        vmov            d21, d26
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d22, d21, d26, #4
+        vmov            d23, d26
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d20, d3[0]
+        vmlal.s16       q2,  d21, d3[1]
+        vmlal.s16       q2,  d22, d3[2]
+        vmlal.s16       q2,  d23, d3[3]
+
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqmovun.s16     d4,  q2
+        subs            \h,  \h,  #2
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        vmov            d18, d20
+        vmov            d19, d21
+        vmov            d20, d22
+        vmov            d21, d23
+        b               28b
+
+0:
+        pop             {r4-r11,pc}
+
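+// Shared helper for the 2-pixel-wide hv cases: loads one row from sr2 and
+// one from src, applies the 4-tap horizontal filter held in d0 to both, and
+// returns the filtered rows packed in d26 (d27 is a rotated copy), rounded
+// by 2 bits to the 16-bit intermediate precision.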
+L(\type\()_8tap_filter_2):
+        vld1.8          {d28},  [\sr2], \s_strd
+        vld1.8          {d30},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vmovl.u8        q13, d28
+        vmovl.u8        q14, d29
+        vmov            d27, d28
+        vmovl.u8        q14, d30
+        vmovl.u8        q15, d31
+        vtrn.32         d26, d28
+        vtrn.32         d27, d30
+        vmul.s16        d26, d26, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d28, d0[2]
+        vmla.s16        d26, d30, d0[3]
+        vrshr.s16       d26, d26, #2
+        vext.8          d27, d26, d26, #4
+        bx              lr
+.endif
+
+40:
+        add             \mx, \mx, #2
+        vld1.32         {d0[]},  [\mx]
+        bgt             480f
+        add             \my, \my,  #2
+        vld1.32         {d2[]},  [\my]
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        // 4x2, 4x4 hv
+        vld1.8          {d30}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d31, d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d31, d0[3]
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d17, d26
+        vmov            d18, d27
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d26, d2[3]
+        vmull.s16       q3,  d17, d2[0]
+        vmlal.s16       q3,  d18, d2[1]
+        vmlal.s16       q3,  d26, d2[2]
+        vmlal.s16       q3,  d27, d2[3]
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqrshrn.s32     d6,  q3,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d6,  q3
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d6[0]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d6}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d26
+        vmov            d18, d27
+        b               4b
+
+480:    // 4x8, 4x16, 4x32 hv
+        vld1.8          {d2},  [\my]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.8          {d30}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d31, d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d31, d0[3]
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d17, d26
+        vmov            d18, d27
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d19, d26
+        vmov            d20, d27
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d21, d26
+        vmov            d22, d27
+
+48:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d20, d3[0]
+        vmlal.s16       q2,  d21, d3[1]
+        vmlal.s16       q2,  d22, d3[2]
+        vmlal.s16       q2,  d26, d3[3]
+        vmull.s16       q3,  d17, d2[0]
+        vmlal.s16       q3,  d18, d2[1]
+        vmlal.s16       q3,  d19, d2[2]
+        vmlal.s16       q3,  d20, d2[3]
+        vmlal.s16       q3,  d21, d3[0]
+        vmlal.s16       q3,  d22, d3[1]
+        vmlal.s16       q3,  d26, d3[2]
+        vmlal.s16       q3,  d27, d3[3]
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqrshrn.s32     d6,  q3,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d6,  q3
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d6[0]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d6}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        vmov            d18, d20
+        vmov            d19, d21
+        vmov            d20, d22
+        vmov            d21, d26
+        vmov            d22, d27
+        b               48b
+0:
+        pop             {r4-r11,pc}
+
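+// Shared helper for the 4-pixel-wide hv cases: the same idea as filter_2,
+// filtering one row from sr2 into d26 and one from src into d27 with the
+// 4-tap horizontal filter in d0, each rounded by 2 bits.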
+L(\type\()_8tap_filter_4):
+        vld1.8          {d30}, [\sr2], \s_strd
+        vld1.8          {d31}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d1,  d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d1,  d0[3]
+
+        vmovl.u8        q14, d31
+        vext.8          d30, d28, d29, #2
+        vext.8          d31, d28, d29, #4
+        vext.8          d1,  d28, d29, #6
+        vmul.s16        d27, d28, d0[0]
+        vmla.s16        d27, d30, d0[1]
+        vmla.s16        d27, d31, d0[2]
+        vmla.s16        d27, d1,  d0[3]
+        vrshr.s16       d26, d26, #2
+        vrshr.s16       d27, d27, #2
+        bx              lr
+
+80:
+160:
+320:
+        bgt             880f
+        vpush           {q4-q7}
+        add             \my,  \my,  #2
+        vld1.8          {d0},  [\mx]
+        vld1.32         {d2[]},  [\my]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
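+        // Horizontally filter the initial history row.  The .irpc blocks
+        // unroll taps 1..7: for tap i, vext shifts the widened row by i
+        // pixels so that each lane accumulates coef[i] * src[x + i]; the sum
+        // is then rounded by 2 bits to the 16-bit intermediate precision.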
+        vld1.8          {q14},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vrshr.s16       q3,  q10, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q4,  q10
+        vmov            q5,  q11
+
+8:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q12, d6,  d2[0]
+        vmull.s16       q13, d7,  d2[0]
+        vmull.s16       q14, d8,  d2[0]
+        vmull.s16       q15, d9,  d2[0]
+        vmlal.s16       q12, d8,  d2[1]
+        vmlal.s16       q13, d9,  d2[1]
+        vmlal.s16       q14, d10, d2[1]
+        vmlal.s16       q15, d11, d2[1]
+        vmlal.s16       q12, d10, d2[2]
+        vmlal.s16       q13, d11, d2[2]
+        vmlal.s16       q14, d20, d2[2]
+        vmlal.s16       q15, d21, d2[2]
+        vmlal.s16       q12, d20, d2[3]
+        vmlal.s16       q13, d21, d2[3]
+        vmlal.s16       q14, d22, d2[3]
+        vmlal.s16       q15, d23, d2[3]
+        vqrshrn.s32     d24, q12, #\shift_hv
+        vqrshrn.s32     d25, q13, #\shift_hv
+        vqrshrn.s32     d28, q14, #\shift_hv
+        vqrshrn.s32     d29, q15, #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d28, q14
+        vst1.8          {d24}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q12}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q3,  q5
+        vmov            q4,  q10
+        vmov            q5,  q11
+        b               8b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #2
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               164b
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+        vpush           {q4-q7}
+        vld1.8          {d0},  [\mx]
+        vld1.8          {d2},  [\my]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        sub             \src,  \src,  \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+168:
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        vld1.8          {q14},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vrshr.s16       q3,  q10, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q4,  q10
+        vmov            q5,  q11
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q6,  q10
+        vmov            q7,  q11
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q8,  q10
+        vmov            q9,  q11
+
+88:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q12, d6,  d2[0]
+        vmull.s16       q13, d7,  d2[0]
+        vmull.s16       q14, d8,  d2[0]
+        vmull.s16       q15, d9,  d2[0]
+        vmlal.s16       q12, d8,  d2[1]
+        vmlal.s16       q13, d9,  d2[1]
+        vmlal.s16       q14, d10, d2[1]
+        vmlal.s16       q15, d11, d2[1]
+        vmlal.s16       q12, d10, d2[2]
+        vmlal.s16       q13, d11, d2[2]
+        vmlal.s16       q14, d12, d2[2]
+        vmlal.s16       q15, d13, d2[2]
+        vmlal.s16       q12, d12, d2[3]
+        vmlal.s16       q13, d13, d2[3]
+        vmlal.s16       q14, d14, d2[3]
+        vmlal.s16       q15, d15, d2[3]
+        vmlal.s16       q12, d14, d3[0]
+        vmlal.s16       q13, d15, d3[0]
+        vmlal.s16       q14, d16, d3[0]
+        vmlal.s16       q15, d17, d3[0]
+        vmlal.s16       q12, d16, d3[1]
+        vmlal.s16       q13, d17, d3[1]
+        vmlal.s16       q14, d18, d3[1]
+        vmlal.s16       q15, d19, d3[1]
+        vmlal.s16       q12, d18, d3[2]
+        vmlal.s16       q13, d19, d3[2]
+        vmlal.s16       q14, d20, d3[2]
+        vmlal.s16       q15, d21, d3[2]
+        vmlal.s16       q12, d20, d3[3]
+        vmlal.s16       q13, d21, d3[3]
+        vmlal.s16       q14, d22, d3[3]
+        vmlal.s16       q15, d23, d3[3]
+        vqrshrn.s32     d24, q12, #\shift_hv
+        vqrshrn.s32     d25, q13, #\shift_hv
+        vqrshrn.s32     d28, q14, #\shift_hv
+        vqrshrn.s32     d29, q15, #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d28, q14
+        vst1.8          {d24}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q12}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q3,  q5
+        vmov            q4,  q6
+        vmov            q5,  q7
+        vmov            q6,  q8
+        vmov            q7,  q9
+        vmov            q8,  q10
+        vmov            q9,  q11
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #3
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               168b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
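+// Shared helper for the loops above: horizontally filters one 8-pixel row
+// from sr2 (into q10) and one from src (into q11) with the full 8-tap
+// filter, each rounded by 2 bits.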
+L(\type\()_8tap_filter_8):
+        vld1.8          {q14},  [\sr2], \s_strd
+        vld1.8          {q15},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vmovl.u8        q12, d30
+        vmovl.u8        q13, d31
+        vmul.s16        q11, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q11, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q11, q14, d1[\i-4]
+.endr
+        vrshr.s16       q10, q10, #2
+        vrshr.s16       q11, q11, #2
+        bx              lr
+endfunc
+
+
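+// Bilinear subpel filtering: each output is a 2-tap weighted average, with
+// weights (16 - frac) and frac held in d0/d1 (horizontal) and d2/d3
+// (vertical).  Roughly, for the horizontal put case (a sketch matching the
+// code below; prep stores the unrounded 16-bit sums instead):
+//   dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4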
+function \type\()_bilin_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        vdup.8          d1,  \mx
+        vdup.8          d3,  \my
+        rsb             r8,  \mx, #16
+        rsb             r9,  \my, #16
+        vdup.8          d0,  r8
+        vdup.8          d2,  r9
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+        clz             r8,  \w
+        cmp             \mx, #0
+        sub             r8,  r8,  #24
+        bne             L(\type\()_bilin_h)
+        cmp             \my, #0
+        bne             L(\type\()_bilin_v)
+        b               \type\()_neon
+
+L(\type\()_bilin_h):
+        cmp             \my, #0
+        bne             L(\type\()_bilin_hv)
+
+        adr             r9,  L(\type\()_bilin_h_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_h_tbl):
+        .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20:     // 2xN h
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        vld1.32         {d4[]},  [\src], \s_strd
+        vld1.32         {d6[]},  [\sr2], \s_strd
+        vext.8          d5,  d4,  d4, #1
+        vext.8          d7,  d6,  d6, #1
+        vtrn.16         q2,  q3
+        subs            \h,  \h,  #2
+        vmull.u8        q3,  d4,  d0
+        vmlal.u8        q3,  d5,  d1
+        vqrshrn.u16     d4,  q3,  #4
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        vld1.8          {d4}, [\src], \s_strd
+        vld1.8          {d6}, [\sr2], \s_strd
+        vext.8          d5,  d4,  d4, #1
+        vext.8          d7,  d6,  d6, #1
+        vtrn.32         q2,  q3
+        subs            \h,  \h,  #2
+        vmull.u8        q3,  d4,  d0
+        vmlal.u8        q3,  d5,  d1
+.ifc \type, put
+        vqrshrn.u16     d4,  q3,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d6}, [\dst, :64], \d_strd
+        vst1.16         {d7}, [\ds2, :64], \d_strd
+.endif
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:     // 8xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        vld1.8          {q8},  [\src], \s_strd
+        vld1.8          {q10}, [\sr2], \s_strd
+        vext.8          q9,  q8,  q8,  #1
+        vext.8          q11, q10, q10, #1
+        subs            \h,  \h,  #2
+        vmull.u8        q8,  d16, d0
+        vmull.u8        q10, d20, d0
+        vmlal.u8        q8,  d18, d1
+        vmlal.u8        q10, d22, d1
+.ifc \type, put
+        vqrshrn.u16     d16,  q8,  #4
+        vqrshrn.u16     d18,  q10, #4
+        vst1.8          {d16}, [\dst, :64], \d_strd
+        vst1.8          {d18}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q8},  [\dst, :128], \d_strd
+        vst1.16         {q10}, [\ds2, :128], \d_strd
+.endif
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+
+        sub             \s_strd,  \s_strd,  \w
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w
+.endif
+161:
+        vld1.8          {d16},  [\src]!
+        vld1.8          {d22},  [\sr2]!
+        mov             \mx, \w
+
+16:
+        vld1.8          {d17,d18},  [\src]!
+        vld1.8          {d23,d24},  [\sr2]!
+        vext.8          q10, q8,  q9,  #1
+        vext.8          q13, q11, q12, #1
+        vmull.u8        q2,  d16, d0
+        vmull.u8        q3,  d17, d0
+        vmull.u8        q14, d22, d0
+        vmull.u8        q15, d23, d0
+        vmlal.u8        q2,  d20, d1
+        vmlal.u8        q3,  d21, d1
+        vmlal.u8        q14, d26, d1
+        vmlal.u8        q15, d27, d1
+        subs            \mx, \mx, #16
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vqrshrn.u16     d5,  q3,  #4
+        vqrshrn.u16     d28, q14, #4
+        vqrshrn.u16     d29, q15, #4
+        vst1.8          {q2},  [\dst, :128]!
+        vst1.8          {q14}, [\ds2, :128]!
+.else
+        vst1.16         {q2,  q3},  [\dst, :128]!
+        vst1.16         {q14, q15}, [\ds2, :128]!
+.endif
+        ble             9f
+
+        vmov            d16, d18
+        vmov            d22, d24
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             161b
+        pop             {r4-r11,pc}
+
+L(\type\()_bilin_v):
+        cmp             \h,  #4
+        adr             r9,  L(\type\()_bilin_v_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_v_tbl):
+        .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20:     // 2xN v
+.ifc \type, put
+        cmp             \h,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        vld1.16         {d16[]}, [\src], \s_strd
+        bgt             24f
+        vld1.16         {d17[]}, [\sr2], \s_strd
+        vld1.16         {d18[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #6
+        vext.8          d17, d17, d18, #6
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.16         {d4[0]}, [\dst, :16]
+        vst1.16         {d4[1]}, [\ds2, :16]
+        pop             {r4-r11,pc}
+24:     // 2x4, 2x8, ... v
+        vld1.16         {d17[]}, [\sr2], \s_strd
+        vld1.16         {d18[]}, [\src], \s_strd
+        vld1.16         {d19[]}, [\sr2], \s_strd
+        vld1.16         {d20[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #6
+        vext.8          d17, d17, d18, #6
+        vext.8          d18, d18, d19, #6
+        vext.8          d19, d19, d20, #6
+        vtrn.32         d16, d18
+        vtrn.32         d17, d19
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        subs            \h,  \h,  #4
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        vst1.16         {d4[2]}, [\dst, :16], \d_strd
+        vst1.16         {d4[3]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d20
+        b               24b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.32         {d16[]}, [\src], \s_strd
+4:
+        vld1.32         {d17[]}, [\sr2], \s_strd
+        vld1.32         {d18[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #4
+        vext.8          d17, d17, d18, #4
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16,  d18
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.8          {d16}, [\src], \s_strd
+8:
+        vld1.8          {d17}, [\sr2], \s_strd
+        vld1.8          {d18}, [\src], \s_strd
+        vmull.u8        q2,  d16, d2
+        vmull.u8        q3,  d17, d2
+        vmlal.u8        q2,  d17, d3
+        vmlal.u8        q3,  d18, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vqrshrn.u16     d6,  q3,  #4
+        vst1.8          {d4}, [\dst, :64], \d_strd
+        vst1.8          {d6}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q2}, [\dst, :128], \d_strd
+        vst1.16         {q3}, [\ds2, :128], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        b               8b
+0:
+        pop             {r4-r11,pc}
+
+160:    // 16xN, 32xN, ...
+320:
+640:
+1280:
+        mov             \my, \h
+1:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {q8},  [\src], \s_strd
+2:
+        vld1.8          {q9},  [\sr2], \s_strd
+        vld1.8          {q10}, [\src], \s_strd
+        vmull.u8        q12, d16, d2
+        vmull.u8        q13, d17, d2
+        vmull.u8        q14, d18, d2
+        vmull.u8        q15, d19, d2
+        vmlal.u8        q12, d18, d3
+        vmlal.u8        q13, d19, d3
+        vmlal.u8        q14, d20, d3
+        vmlal.u8        q15, d21, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d24, q12, #4
+        vqrshrn.u16     d25, q13, #4
+        vqrshrn.u16     d28, q14, #4
+        vqrshrn.u16     d29, q15, #4
+        vst1.8          {q12}, [\dst, :128], \d_strd
+        vst1.8          {q14}, [\ds2, :128], \d_strd
+.else
+        vst1.16         {q12, q13}, [\dst, :128], \d_strd
+        vst1.16         {q14, q15}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #16
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #16
+.ifc \type, put
+        add             \dst, \dst, #16
+.else
+        add             \dst, \dst, #32
+.endif
+        b               1b
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+        vmovl.u8        q2,  d2
+        vmovl.u8        q3,  d3
+        adr             r9,  L(\type\()_bilin_hv_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_hv_tbl):
+        .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20:     // 2xN hv
+.ifc \type, put
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.32         {d28[]},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vmull.u8        q8,  d28, d0
+        vmlal.u8        q8,  d29, d1
+
+2:
+        vld1.32         {d28[]},  [\sr2], \s_strd
+        vld1.32         {d30[]},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vtrn.16         d28, d30
+        vtrn.16         d29, d31
+        vmull.u8        q9,  d28, d0
+        vmlal.u8        q9,  d29, d1
+
+        vtrn.32         d16, d18
+
+        vmul.u16        d20, d16, d4
+        vmla.u16        d20, d19, d6
+        vqrshrn.u16     d20, q10, #8
+        subs            \h,  \h,  #2
+        vst1.16         {d20[0]}, [\dst, :16], \d_strd
+        vst1.16         {d20[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vtrn.32         d19, d16
+        b               2b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN hv
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {d28},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vmull.u8        q8,  d28, d0
+        vmlal.u8        q8,  d29, d1
+
+4:
+        vld1.8          {d28},  [\sr2], \s_strd
+        vld1.8          {d30},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vtrn.32         d28, d30
+        vtrn.32         d29, d31
+        vmull.u8        q9,  d28, d0
+        vmlal.u8        q9,  d29, d1
+
+        vmov            d17, d18
+
+        vmul.u16        q10, q8, q2
+        vmla.u16        q10, q9, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d20, q10, #8
+        vst1.32         {d20[0]}, [\dst, :32], \d_strd
+        vst1.32         {d20[1]}, [\ds2, :32], \d_strd
+.else
+        vrshr.u16       q10, q10, #4
+        vst1.16         {d20}, [\dst, :64], \d_strd
+        vst1.16         {d21}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d19
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {q12},  [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vmull.u8        q8,  d24, d0
+        vmlal.u8        q8,  d26, d1
+
+2:
+        vld1.8          {q12},  [\sr2], \s_strd
+        vld1.8          {q14},  [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vext.8          q15, q14, q14, #1
+        vmull.u8        q9,  d24, d0
+        vmlal.u8        q9,  d26, d1
+        vmull.u8        q10, d28, d0
+        vmlal.u8        q10, d30, d1
+
+        vmul.u16        q8,  q8,  q2
+        vmla.u16        q8,  q9,  q3
+        vmul.u16        q9,  q9,  q2
+        vmla.u16        q9,  q10, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d16, q8,  #8
+        vqrshrn.u16     d18, q9,  #8
+        vst1.8          {d16}, [\dst, :64], \d_strd
+        vst1.8          {d18}, [\ds2, :64], \d_strd
+.else
+        vrshr.u16       q8,  q8,  #4
+        vrshr.u16       q9,  q9,  #4
+        vst1.16         {q8}, [\dst, :128], \d_strd
+        vst1.16         {q9}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               1b
+0:
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
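+// Instantiate the two variants: "put" writes final 8-bit pixels (note the
+// larger final hv shift, 10, plus a saturating narrow), while "prep" emits
+// 16-bit intermediates for compound prediction (shift 6) and derives its
+// destination stride from the block width (2*w bytes).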
+filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
+
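+// The warp position register effectively holds a fractional coordinate: the
+// bits above the low 10 index an 8-byte row of filter coefficients relative
+// to r11, which is set up (movrel with offset 64*8 below) to point 64 rows
+// into dav1d's warp filter table so that signed indices land in range.
+// Roughly, with filter_mid as a hypothetical name for that biased pointer:
+//   const int8_t *f = &filter_mid[(pos >> 10) * 8];  // load_filter_ptr
+//   pos += inc;                                      // load_filter_coef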
+.macro load_filter_ptr src
+        asr             r12, \src, #10
+        add             r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+        vld1.8          {\dst}, [r12, :64]
+        add             \src, \src, \inc
+.endm
+
+.macro load_filter_row dst, src, inc
+        load_filter_ptr \src
+        load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon
+        load_filter_ptr r5                  // filter 0
+        vld1.16         {q7}, [r2], r3
+
+        load_filter_coef d0, r5,  r7        // filter 0
+        vmovl.u8        q6,  d14            // original pixels
+        load_filter_row d2,  r5,  r7        // filter 1
+        vmovl.u8        q7,  d15            // original pixels
+        load_filter_row d4,  r5,  r7        // filter 2
+        vmovl.s8        q0,  d0             // filter 0
+        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
+        load_filter_ptr r5                  // filter 3
+        vmovl.s8        q1,  d2             // filter 1
+        vmul.i16        q5,  q6,  q0        // filter 0 output
+        load_filter_coef d0, r5,  r7        // filter 3
+        vmovl.s8        q2,  d4             // filter 2
+        load_filter_ptr r5                  // filter 4
+        vext.8          q4,  q6,  q7,  #2*2 // filter 2 pixels
+        vmul.i16        q3,  q3,  q1        // filter 1 output
+        load_filter_coef d2, r5,  r7        // filter 4
+        vmul.i16        q4,  q4,  q2        // filter 2 output
+        vext.8          q2,  q6,  q7,  #2*3 // filter 3 pixels
+        vmovl.s8        q0,  d0             // filter 3
+        vpaddl.s16      q5,  q5             // pixel 0 (4x32)
+        vpaddl.s16      q3,  q3             // pixel 1 (4x32)
+        vmul.i16        q0,  q2,  q0        // filter 3 output
+        load_filter_ptr r5                  // filter 5
+        vext.8          q2,  q6,  q7,  #2*4 // filter 4 pixels
+        vmovl.s8        q1,  d2             // filter 4
+        vpaddl.s16      q4,  q4             // pixel 2 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0 (2x32)
+        vpadd.s32       d11, d6,  d7        // pixel 1 (2x32)
+        load_filter_coef d6, r5,  r7        // filter 5
+        vmul.i16        q1,  q2,  q1        // filter 4 output
+        vpadd.s32       d8,  d8,  d9        // pixel 2 (2x32)
+        load_filter_ptr r5                  // filter 6
+        vpaddl.s16      q0,  q0             // pixel 3 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0,1
+        vext.8          q2,  q6,  q7,  #2*5 // filter 5 pixels
+        vmovl.s8        q3,  d6             // filter 5
+        vpaddl.s16      q1,  q1             // pixel 4 (4x32)
+        vpadd.s32       d9,  d0,  d1        // pixel 3 (2x32)
+        load_filter_coef d0, r5,  r7        // filter 6
+        vmul.i16        q2,  q2,  q3        // filter 5 output
+        vpadd.s32       d11, d8,  d9        // pixel 2,3
+        load_filter_ptr r5                  // filter 7
+        vpaddl.s16      q2,  q2             // pixel 5 (4x32)
+        vpadd.s32       d8,  d2,  d3        // pixel 4 (2x32)
+        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
+        vmovl.s8        q0,  d0             // filter 6
+        vpadd.s32       d9,  d4,  d5        // pixel 5 (2x32)
+        load_filter_coef d4, r5,  r7        // filter 7
+        vpadd.s32       d8,  d8,  d9        // pixel 4,5
+        vext.8          q1,  q6,  q7,  #2*7 // filter 7 pixels
+        vmovl.s8        q2,  d4             // filter 7
+        vmul.i16        q3,  q3,  q0        // filter 6 output
+        vmul.i16        q1,  q1,  q2        // filter 7 output
+        sub             r5,  r5,  r7, lsl #3
+        vpaddl.s16      q3,  q3             // pixel 6 (4x32)
+        vpaddl.s16      q1,  q1             // pixel 7 (4x32)
+        vpadd.s32       d6,  d6,  d7        // pixel 6 (2x32)
+        vpadd.s32       d2,  d2,  d3        // pixel 7 (2x32)
+        vpadd.s32       d9,  d6,  d2        // pixel 6,7
+
+        add             r5,  r5,  r8
+
+        vrshrn.s32      d10, q5,  #3
+        vrshrn.s32      d11, q4,  #3
+
+        bx              lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
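+// The 8x8 warp keeps a sliding window of eight horizontally filtered source
+// rows live in q8-q15; every iteration produces one output row by applying
+// per-column vertical filters (eight filters fetched and transposed per
+// row), then shifts the window down by one row.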
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        ldrd            r8,  r9,  [r4]
+        sxth            r7,  r8
+        asr             r8,  r8, #16
+        asr             r4,  r9, #16
+        sxth            r9,  r9
+        mov             r10, #8
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        sub             r2,  r2,  #3
+        movrel          r11, X(mc_warp_filter), 64*8
+.ifnb \t
+        lsl             r1,  r1,  #1
+.endif
+        add             r5,  r5,  #512
+        add             r6,  r6,  #512
+
+        bl              warp_filter_horz_neon
+        vmov            q8,  q5
+        bl              warp_filter_horz_neon
+        vmov            q9,  q5
+        bl              warp_filter_horz_neon
+        vmov            q10, q5
+        bl              warp_filter_horz_neon
+        vmov            q11, q5
+        bl              warp_filter_horz_neon
+        vmov            q12, q5
+        bl              warp_filter_horz_neon
+        vmov            q13, q5
+        bl              warp_filter_horz_neon
+        vmov            q14, q5
+
+1:
+        bl              warp_filter_horz_neon
+        vmov            q15, q5
+
+        load_filter_row d8,  r6,  r9
+        load_filter_row d9,  r6,  r9
+        load_filter_row d10, r6,  r9
+        load_filter_row d11, r6,  r9
+        load_filter_row d12, r6,  r9
+        load_filter_row d13, r6,  r9
+        load_filter_row d14, r6,  r9
+        load_filter_row d15, r6,  r9
+        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
+        vmovl.s8        q1,  d8
+        vmovl.s8        q2,  d9
+        vmovl.s8        q3,  d10
+        vmovl.s8        q4,  d11
+        vmovl.s8        q5,  d12
+        vmovl.s8        q6,  d13
+
+        sub             r6,  r6,  r9, lsl #3
+
+        // This ordering of vmull/vmlal is highly beneficial for
+        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+        vmull.s16       q0,  d16,  d2
+        vmlal.s16       q0,  d18,  d4
+        vmlal.s16       q0,  d20,  d6
+        vmlal.s16       q0,  d22,  d8
+        vmlal.s16       q0,  d24,  d10
+        vmlal.s16       q0,  d26,  d12
+        vmull.s16       q1,  d17,  d3
+        vmlal.s16       q1,  d19,  d5
+        vmlal.s16       q1,  d21,  d7
+        vmlal.s16       q1,  d23,  d9
+        vmlal.s16       q1,  d25,  d11
+        vmlal.s16       q1,  d27,  d13
+
+        vmovl.s8        q2,  d14
+        vmovl.s8        q3,  d15
+
+        vmlal.s16       q0,  d28,  d4
+        vmlal.s16       q0,  d30,  d6
+        vmlal.s16       q1,  d29,  d5
+        vmlal.s16       q1,  d31,  d7
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vqrshrn.s32     d0,  q0,  #\shift
+        vmov            q10, q11
+        vqrshrn.s32     d1,  q1,  #\shift
+        vmov            q11, q12
+        vmov            q12, q13
+.ifb \t
+        vqmovun.s16     d0,  q0
+.endif
+        vmov            q13, q14
+        vmov            q14, q15
+        subs            r10, r10, #1
+.ifnb \t
+        vst1.16         {q0}, [r0, :128], r1
+.else
+        vst1.8          {d0}, [r0, :64], r1
+.endif
+
+        add             r6,  r6,  r4
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
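+// Instantiate both variants: the plain one emits final 8-bit pixels (shift
+// 11 plus a saturating narrow); the "t" variant emits 16-bit intermediates
+// (shift 7) and doubles its destination stride accordingly.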
+warp  , 11
+warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
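+// The reference pointer is first clamped into the valid image, then the
+// edge pixels are replicated outwards.  The clamping as a C sketch, matching
+// the iclip() comments in the code below:
+//   ref += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1);
+//   top_ext    = iclip(-y,          0, bh - 1);
+//   bottom_ext = iclip(y + bh - ih, 0, bh - 1);
+//   left_ext   = iclip(-x,          0, bw - 1);
+//   right_ext  = iclip(x + bw - iw, 0, bw - 1);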
+function emu_edge_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        ldrd            r8,  r9,  [sp, #52]
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             r12, r3,  #1           // ih - 1
+        cmp             r5,  r3
+        sub             lr,  r2,  #1           // iw - 1
+        it              lt
+        movlt           r12, r5                // min(y, ih - 1)
+        cmp             r4,  r2
+        bic             r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+        it              lt
+        movlt           lr,  r4                // min(x, iw - 1)
+        bic             lr,  lr,  lr,  asr #31 // max(min(x, iw - 1), 0)
+        mla             r8,  r12, r9,  r8      // ref += iclip(y, 0, ih - 1) * stride
+        add             r8,  r8,  lr           // ref += iclip(x, 0, iw - 1)
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             r10, r5,  r1           // y + bh
+        neg             r5,  r5                // -y
+        sub             r10, r10, r3           // y + bh - ih
+        sub             r12, r1,  #1           // bh - 1
+        cmp             r10, r1
+        bic             r5,  r5,  r5,  asr #31 // max(-y, 0)
+        it              ge
+        movge           r10, r12               // min(y + bh - ih, bh-1)
+        cmp             r5,  r1
+        bic             r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+        it              ge
+        movge           r5,  r12               // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             r11, r4,  r0           // x + bw
+        neg             r4,  r4                // -x
+        sub             r11, r11, r2           // x + bw - iw
+        sub             lr,  r0,  #1           // bw - 1
+        cmp             r11, r0
+        bic             r4,  r4,  r4,  asr #31 // max(-x, 0)
+        it              ge
+        movge           r11, lr                // min(x + bw - iw, bw-1)
+        cmp             r4,  r0
+        bic             r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+        it              ge
+        movge           r4,  lr                // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             r1,  r1,  r5           // bh - top_ext
+        mla             r6,  r5,  r7,  r6
+        sub             r2,  r0,  r4           // bw - left_ext
+        sub             r1,  r1,  r10          // center_h = bh - top_ext - bottom_ext
+        sub             r2,  r2,  r11          // center_w = bw - left_ext - right_ext
+
+        mov             r0,  r6                // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+        vld1.8          {d0[]}, [r8]
+        mov             r12, r6                // out = dst
+        mov             r3,  r4
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12, :128]!
+        bgt             1b
+.endif
+        mov             lr,  r8
+        add             r12, r6,  r4           // out = dst + left_ext
+        mov             r3,  r2
+1:
+        vld1.8          {q0, q1}, [lr]!
+        subs            r3,  r3,  #32
+.if \need_left
+        vst1.8          {q0, q1}, [r12]!
+.else
+        vst1.8          {q0, q1}, [r12, :128]!
+.endif
+        bgt             1b
+.if \need_right
+        add             r3,  r8,  r2           // in + center_w
+        sub             r3,  r3,  #1           // in + center_w - 1
+        add             r12, r6,  r4           // dst + left_ext
+        vld1.8          {d0[]}, [r3]
+        add             r12, r12, r2           // out = dst + left_ext + center_w
+        mov             r3,  r11
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12]!
+        bgt             1b
+.endif
+
+        subs            r1,  r1,  #1           // center_h--
+        add             r6,  r6,  r7
+        add             r8,  r8,  r9
+        bgt             0b
+.endm
+
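+        // The four instantiations below specialize the row loop at assembly
+        // time for each need_left/need_right combination, so the hot loop
+        // itself carries no edge-case branches.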
+        cmp             r4,  #0
+        beq             2f
+        // need_left
+        cmp             r11, #0
+        beq             3f
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cmp             r11, #0
+        beq             4f
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+        cmp             r10, #0
+        // Storing the original dst in r0 overwrote bw; recalculate it here
+        add             r2,  r2,  r4           // center_w + left_ext
+        add             r2,  r2,  r11          // bw = center_w + left_ext + right_ext
+
+        beq             3f
+        // need_bottom
+        sub             r8,  r6,  r7           // ref = dst - stride
+        mov             r4,  r2
+1:
+        vld1.8          {q0, q1}, [r8, :128]!
+        mov             r3,  r10
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7
+        bgt             2b
+        mls             r6,  r7,  r10,  r6     // dst -= bottom_ext * stride
+        subs            r4,  r4,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        cmp             r5,  #0
+        beq             3f
+        // need_top
+        mls             r6,  r7,  r5,  r0      // dst = stored_dst - top_ext * stride
+1:
+        vld1.8          {q0, q1}, [r0, :128]!
+        mov             r3,  r5
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7
+        bgt             2b
+        mls             r6,  r7,  r5,  r6      // dst -= top_ext * stride
+        subs            r2,  r2,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        pop             {r4-r11,pc}
+endfunc
diff --git a/src/arm/32/msac.S b/src/arm/32/msac.S
new file mode 100644
index 0000000..b06e109
--- /dev/null
+++ b/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 4
+#define DIF 8
+#define RNG 12
+#define CNT 16
+#define ALLOW_UPDATE_CDF 20
+
+const coeffs
+        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
+endconst
+
+const bits, align=4
+        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
+        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
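+// How the tables above are used below (inferred from the code, not
+// normative): coeffs is loaded at an offset chosen so that lane i holds
+// EC_MIN_PROB * (n_symbols - i), with EC_MIN_PROB == 4; bits holds one
+// distinct bit per 16-bit lane, so ANDing it with a compare mask and
+// horizontally adding the lanes yields a bitmask whose lowest set bit
+// (recovered with rbit + clz) is the index of the first matching lane.
+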
+.macro vld1_align_n d0, q0, q1, src, n
+.if \n == 4
+        vld1.16         {\d0},  [\src, :64]
+.elseif \n == 8
+        vld1.16         {\q0},  [\src, :128]
+.else
+        vld1.16         {\q0, \q1},  [\src, :128]
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n
+.if \n == 4
+        vld1.16         {\d0},  [\src]
+.elseif \n == 8
+        vld1.16         {\q0},  [\src]
+.else
+        vld1.16         {\q0, \q1},  [\src]
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n
+.if \n == 4
+        vst1.16         {\d0},  [\src, :64]
+.elseif \n == 8
+        vst1.16         {\q0},  [\src, :128]
+.else
+        vst1.16         {\q0, \q1},  [\src, :128]
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n
+.if \n == 4
+        vst1.16         {\d0},  [\src]
+.elseif \n == 8
+        vst1.16         {\q0},  [\src]
+.else
+        vst1.16         {\q0, \q1},  [\src]
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vshr.u16        \d0,  \s0,  \s3
+.else
+        vshr.u16        \d1,  \s1,  \s4
+.if \n == 16
+        vshr.u16        \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vadd.i16        \d0,  \s0,  \s3
+.else
+        vadd.i16        \d1,  \s1,  \s4
+.if \n == 16
+        vadd.i16        \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vsub.i16        \d0,  \s0,  \s3
+.else
+        vsub.i16        \d1,  \s1,  \s4
+.if \n == 16
+        vsub.i16        \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vand            \d0,  \s0,  \s3
+.else
+        vand            \d1,  \s1,  \s4
+.if \n == 16
+        vand            \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vcge.u16        \d0,  \s0,  \s3
+.else
+        vcge.u16        \d1,  \s1,  \s4
+.if \n == 16
+        vcge.u16        \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vrhadd.u16      \d0,  \s0,  \s3
+.else
+        vrhadd.u16      \d1,  \s1,  \s4
+.if \n == 16
+        vrhadd.u16      \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vshl.s16        \d0,  \s0,  \s3
+.else
+        vshl.s16        \d1,  \s1,  \s4
+.if \n == 16
+        vshl.s16        \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vqdmulh.s16     \d0,  \s0,  \s3
+.else
+        vqdmulh.s16     \d1,  \s1,  \s4
+.if \n == 16
+        vqdmulh.s16     \d2,  \s2,  \s5
+.endif
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+//                                               size_t n_symbols);
+
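+// A rough scalar equivalent of decode_update + L(renorm), hedged and
+// reconstructed from the inline comments (EC_PROB_SHIFT == 6,
+// EC_MIN_PROB == 4; see src/msac.c for the real reference):
+//
+//   c = s->dif >> (EC_WIN_SIZE - 16);
+//   for (ret = 0; ; ret++) {
+//       v = ((s->rng >> 8) * (cdf[ret] >> 6) >> 1) + 4 * (n_symbols - ret);
+//       if (c >= v) break;        // first symbol interval containing c
+//   }
+//   if (s->allow_update_cdf) {
+//       rate = 4 + (cdf[n_symbols] >> 4) + (n_symbols > 2);
+//       for (i = 0; i < n_symbols; i++)   // both update cases via one mask
+//           cdf[i] += (((i >= ret ? -1 : 32768) - cdf[i]) >> rate)
+//                   + (i >= ret);
+//       cdf[n_symbols] += (cdf[n_symbols] < 32);
+//   }
+//   // then rng = u - v (u being the previous v, or s->rng for ret == 0)
+//   // and dif -= v << 16, renormalized in L(renorm)
+//
+// The vector code computes all candidate v values at once and extracts
+// ret from the c >= v compare mask via the bits table.
+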
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update n
+        push            {r4-r10,lr}
+        sub             sp,  sp,  #48
+        add             r8,  r0,  #RNG
+
+        vld1_align_n    d0,  q0,  q1,  r1,  \n                         // cdf
+        vld1.16         {d16[]}, [r8, :16]                             // rng
+        movrel_local    r9,  coeffs, 30
+        vmov.i16        d30, #0x7f00                                   // 0x7f00
+        sub             r9,  r9,  r2, lsl #1
+        vmvn.i16        q14, #0x3f                                     // 0xffc0
+        add             r8,  sp,  #14
+        vand            d22, d16, d30                                  // rng & 0x7f00
+        vst1.16         {d16[0]}, [r8, :16]                            // store original u = s->rng
+        vand_n          d4,  q2,  q3,  d0,  q0,  q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+        vmov            d23, d22
+.endif
+
+        vld1_n          d16, q8,  q9,  r9,  \n                          // EC_MIN_PROB * (n_symbols - ret)
+        vqdmulh_n       d20, q10, q11, d4,  q2,  q3,  d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             r8,  r0,  #DIF + 2
+
+        vadd_n          d16, q8,  q9,  d4,  q2,  q3,  d16, q8,  q9,  \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+        vmov.i16        d17, #0
+.endif
+        vadd_n          d16, q8,  q9,  d20, q10, q11, d16, q8,  q9,  \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+        add             r9,  sp,  #16
+        vld1.16         {d20[]}, [r8, :16]                              // dif >> (EC_WIN_SIZE - 16)
+        movrel_local    r8,  bits
+        vst1_n          q8,  q8,  q9,  r9,  \n                          // store v values to allow indexed access
+
+        vmov            d21, d20
+        vld1_align_n    q12, q12, q13, r8,  \n
+.if \n == 16
+        vmov            q11, q10
+.endif
+
+        vcge_n          q2,  q2,  q3,  q10, q10, q11, q8,  q8,  q9,  \n // c >= v
+
+        vand_n          q10, q10, q11, q2,  q2,  q3,  q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+        vadd.i16        q10, q10, q11
+.endif
+        vadd.i16        d20, d20, d21                                   // Aggregate mask bits
+        ldr             r4,  [r0, #ALLOW_UPDATE_CDF]
+        vpadd.i16       d20, d20, d20
+        lsl             r10, r2,  #1
+        vpadd.i16       d20, d20, d20
+        vmov.u16        r3,  d20[0]
+        cmp             r4,  #0
+        rbit            r3,  r3
+        clz             lr,  r3                                         // ret
+
+        beq             L(renorm)
+        // update_cdf
+        ldrh            r3,  [r1, r10]                                  // count = cdf[n_symbols]
+        vmov.i8         q10, #0xff
+.if \n == 16
+        mov             r4,  #-5
+.else
+        mvn             r12, r2
+        mov             r4,  #-4
+        cmn             r12, #3                                         // set C if n_symbols <= 2
+.endif
+        vrhadd_n        d16, q8,  q9,  d20, q10, q10, d4,  q2,  q3,  \n // i >= val ? -1 : 32768
+.if \n == 16
+        sub             r4,  r4,  r3, lsr #4                            // -((count >> 4) + 5)
+.else
+        lsr             r12, r3,  #4                                    // count >> 4
+        sbc             r4,  r4,  r12                                   // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+        vsub_n          d16, q8,  q9,  d16, q8,  q9,  d0,  q0,  q1,  \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+        vdup.16         d20, r4                                         // -rate
+.else
+        vdup.16         q10, r4                                         // -rate
+.endif
+
+        sub             r3,  r3,  r3, lsr #5                            // count - (count == 32)
+        vsub_n          d0,  q0,  q1,  d0,  q0,  q1,  d4,  q2,  q3,  \n // cdf + (i >= val ? 1 : 0)
+        vshl_n          d16, q8,  q9,  d16, q8,  q9,  d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+        add             r3,  r3,  #1                                    // count + (count < 32)
+        vadd_n          d0,  q0,  q1,  d0,  q0,  q1,  d16, q8,  q9,  \n // cdf + (32768 - cdf[i]) >> rate
+        vst1_align_n    d0,  q0,  q1,  r1,  \n
+        strh            r3,  [r1, r10]
+.endm
+
+        decode_update   4
+
+L(renorm):
+        add             r8,  sp,  #16
+        add             r8,  r8,  lr, lsl #1
+        ldrh            r3,  [r8]              // v
+        ldrh            r4,  [r8, #-2]         // u
+        ldr             r6,  [r0, #CNT]
+        ldr             r7,  [r0, #DIF]
+        sub             r4,  r4,  r3           // rng = u - v
+        clz             r5,  r4                // clz(rng)
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mvn             r7,  r7                // ~dif
+        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+L(renorm2):
+        lsl             r4,  r4,  r5           // rng << d
+        subs            r6,  r6,  r5           // cnt -= d
+        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        str             r4,  [r0, #RNG]
+        mvn             r7,  r7                // ~dif
+        bhs             9f
+
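+        // A hedged scalar sketch of this refill (dif is kept bit-inverted
+        // in r7, hence the mvn pairs; cf. ctx_refill in src/msac.c):
+        //   c = 8 - cnt;          // EC_WIN_SIZE - cnt - 24
+        //   while (c >= 0 && buf_pos < buf_end)
+        //       { dif ^= *buf_pos++ << c; c -= 8; }
+        //   cnt = 8 - c;
+        // The fast path below does the same with a single 4-byte load;
+        // refill_eob is the byte-at-a-time tail for the end of the buffer.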
+        // refill
+        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
+        ldr             r4,  [r0, #BUF_END]    // BUF_END
+        add             r5,  r3,  #4
+        cmp             r5,  r4
+        bgt             2f
+
+        ldr             r3,  [r3]              // next_bits
+        add             r8,  r6,  #23          // shift_bits = cnt + 23
+        add             r6,  r6,  #16          // cnt += 16
+        rev             r3,  r3                // next_bits = bswap(next_bits)
+        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             r8,  r8,  #24          // shift_bits &= 24
+        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
+        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
+        str             r5,  [r0, #BUF_POS]
+        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
+        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
+        eor             r7,  r7,  r3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        rsb             r5,  r6,  #8           // c = 8 - cnt
+3:
+        cmp             r3,  r4
+        bge             4f
+        ldrb            r8,  [r3], #1
+        lsl             r8,  r8,  r5
+        eor             r7,  r7,  r8
+        subs            r5,  r5,  #8
+        bge             3b
+
+4:      // refill_eob_end
+        str             r3,  [r0, #BUF_POS]
+        rsb             r6,  r5,  #8           // cnt = 8 - c
+
+9:
+        str             r6,  [r0, #CNT]
+        str             r7,  [r0, #DIF]
+
+        mov             r0,  lr
+        add             sp,  sp,  #48
+
+        pop             {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+        decode_update   8
+        b               L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+        decode_update   16
+        b               L(renorm)
+endfunc
+
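+// unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+//
+// Hedged sketch of the scalar logic (cf. dav1d_msac_decode_hi_tok_c):
+// up to four 3-symbol adaptive decodes sharing one cdf, stopping as soon
+// as a branch symbol below 3 is decoded:
+//
+//   unsigned tok_br = msac_decode_symbol_adapt4(s, cdf, 3);
+//   unsigned tok = 3 + tok_br;
+//   if (tok_br == 3) {
+//       tok_br = msac_decode_symbol_adapt4(s, cdf, 3);
+//       tok = 6 + tok_br;
+//       if (tok_br == 3) {
+//           tok_br = msac_decode_symbol_adapt4(s, cdf, 3);
+//           tok = 9 + tok_br;
+//           if (tok_br == 3)
+//               tok = 12 + msac_decode_symbol_adapt4(s, cdf, 3);
+//       }
+//   }
+//   return tok;
+//
+// Below, r2 accumulates 2*tok with a bias so that the carry flag of the
+// final adds doubles as the loop exit condition.
+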
+function msac_decode_hi_tok_neon, export=1
+        push            {r4-r10,lr}
+        vld1.16         {d0},  [r1, :64]       // cdf
+        add             r4,  r0,  #RNG
+        vmov.i16        d31, #0x7f00           // 0x7f00
+        movrel_local    r5,  coeffs, 30-2*3
+        vmvn.i16        d30, #0x3f             // 0xffc0
+        ldrh            r9,  [r1, #6]          // count = cdf[n_symbols]
+        vld1.16         {d1[]},  [r4, :16]     // rng
+        movrel_local    r4,  bits
+        vld1.16         {d29}, [r5]            // EC_MIN_PROB * (n_symbols - ret)
+        add             r5,  r0,  #DIF + 2
+        vld1.16         {q8}, [r4, :128]
+        mov             r2,  #-24
+        vand            d20, d0, d30           // cdf & 0xffc0
+        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
+        vld1.16         {d2[]}, [r5, :16]      // dif >> (EC_WIN_SIZE - 16)
+        sub             sp,  sp,  #48
+        ldr             r6,  [r0, #CNT]
+        ldr             r7,  [r0, #DIF]
+        vmov            d3,  d2
+1:
+        vand            d23, d1,  d31          // rng & 0x7f00
+        vqdmulh.s16     d18, d20, d23          // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             r12, sp,  #14
+        vadd.i16        d6,  d20, d29          // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        vadd.i16        d6,  d18, d6           // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+        vmov.i16        d7,  #0
+        vst1.16         {d1[0]}, [r12, :16]    // store original u = s->rng
+        add             r12, sp,  #16
+        vcge.u16        q2,  q1,  q3           // c >= v
+        vst1.16         {q3},  [r12]           // store v values to allow indexed access
+        vand            q9,  q2,  q8           // One bit per halfword set in the mask
+
+        vadd.i16        d18, d18, d19          // Aggregate mask bits
+        vpadd.i16       d18, d18, d18
+        vpadd.i16       d18, d18, d18
+        vmov.u16        r3,  d18[0]
+        cmp             r10, #0
+        add             r2,  r2,  #5
+        rbit            r3,  r3
+        add             r8,  sp,  #16
+        clz             lr,  r3                // ret
+
+        beq             2f
+        // update_cdf
+        vmov.i8         d22, #0xff
+        mov             r4,  #-5
+        vrhadd.u16      d6,  d22, d4           // i >= val ? -1 : 32768
+        sub             r4,  r4,  r9, lsr #4   // -((count >> 4) + 5)
+        vsub.i16        d6,  d6,  d0           // (32768 - cdf[i]) or (-1 - cdf[i])
+        vdup.16         d18, r4                // -rate
+
+        sub             r9,  r9,  r9, lsr #5   // count - (count == 32)
+        vsub.i16        d0,  d0,  d4           // cdf + (i >= val ? 1 : 0)
+        vshl.s16        d6,  d6,  d18          // ({32768,-1} - cdf[i]) >> rate
+        add             r9,  r9,  #1           // count + (count < 32)
+        vadd.i16        d0,  d0,  d6           // cdf + (32768 - cdf[i]) >> rate
+        vst1.16         {d0},  [r1, :64]
+        vand            d20, d0,  d30          // cdf & 0xffc0
+        strh            r9,  [r1, #6]
+
+2:
+        add             r8,  r8,  lr, lsl #1
+        ldrh            r3,  [r8]              // v
+        ldrh            r4,  [r8, #-2]         // u
+        sub             r4,  r4,  r3           // rng = u - v
+        clz             r5,  r4                // clz(rng)
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mvn             r7,  r7                // ~dif
+        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+        lsl             r4,  r4,  r5           // rng << d
+        subs            r6,  r6,  r5           // cnt -= d
+        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        str             r4,  [r0, #RNG]
+        vdup.16         d1,  r4
+        mvn             r7,  r7                // ~dif
+        bhs             9f
+
+        // refill
+        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
+        ldr             r4,  [r0, #BUF_END]    // BUF_END
+        add             r5,  r3,  #4
+        cmp             r5,  r4
+        bgt             2f
+
+        ldr             r3,  [r3]              // next_bits
+        add             r8,  r6,  #23          // shift_bits = cnt + 23
+        add             r6,  r6,  #16          // cnt += 16
+        rev             r3,  r3                // next_bits = bswap(next_bits)
+        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             r8,  r8,  #24          // shift_bits &= 24
+        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
+        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
+        str             r5,  [r0, #BUF_POS]
+        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
+        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
+        eor             r7,  r7,  r3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        rsb             r5,  r6,  #8           // c = 8 - cnt
+3:
+        cmp             r3,  r4
+        bge             4f
+        ldrb            r8,  [r3], #1
+        lsl             r8,  r8,  r5
+        eor             r7,  r7,  r8
+        subs            r5,  r5,  #8
+        bge             3b
+
+4:      // refill_eob_end
+        str             r3,  [r0, #BUF_POS]
+        rsb             r6,  r5,  #8           // cnt = 8 - c
+
+9:
+        lsl             lr,  lr,  #1
+        sub             lr,  lr,  #5
+        lsr             r12, r7,  #16
+        adds            r2,  r2,  lr           // carry = tok_br < 3 || tok == 15
+        vdup.16         q1,  r12
+        bcc             1b                     // loop if !carry
+        add             r2,  r2,  #30
+        str             r6,  [r0, #CNT]
+        add             sp,  sp,  #48
+        str             r7,  [r0, #DIF]
+        lsr             r0,  r2,  #1
+        pop             {r4-r10,pc}
+endfunc
+
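+// unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+//
+// Hedged sketch: a 50/50 boolean needs no multiply, only shifts of rng:
+//   v   = ((rng >> 8) << 7) + 4;  vw = v << 16;
+//   ret = dif < vw;               // the returned bit
+//   if (!ret) { v = rng - v; dif -= vw; }
+// then renormalize with rng = v via the shared L(renorm2) tail.
+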
+function msac_decode_bool_equi_neon, export=1
+        push            {r4-r10,lr}
+        ldr             r5,  [r0, #RNG]
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48
+        ldr             r7,  [r0, #DIF]
+        bic             r4,  r5,  #0xff        // r &= 0xff00
+        add             r4,  r4,  #8
+        mov             r2,  #0
+        subs            r8,  r7,  r4, lsl #15  // dif - vw
+        lsr             r4,  r4,  #1           // v
+        sub             r5,  r5,  r4           // r - v
+        itee            lo
+        movlo           r2,  #1
+        movhs           r4,  r5                // if (ret) v = r - v;
+        movhs           r7,  r8                // if (ret) dif = dif - vw;
+
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2
+        b               L(renorm2)
+endfunc
+
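+// unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+//
+// As above, but with a 15-bit probability f; a hedged sketch of the
+// split point computed below (EC_PROB_SHIFT == 6, EC_MIN_PROB == 4):
+//   v = ((rng >> 8) * (f >> 6) >> 1) + 4;
+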
+function msac_decode_bool_neon, export=1
+        push            {r4-r10,lr}
+        ldr             r5,  [r0, #RNG]
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48
+        ldr             r7,  [r0, #DIF]
+        lsr             r4,  r5,  #8           // r >> 8
+        bic             r1,  r1,  #0x3f        // f &= ~63
+        mul             r4,  r4,  r1
+        mov             r2,  #0
+        lsr             r4,  r4,  #7
+        add             r4,  r4,  #4           // v
+        subs            r8,  r7,  r4, lsl #16  // dif - vw
+        sub             r5,  r5,  r4           // r - v
+        itee            lo
+        movlo           r2,  #1
+        movhs           r4,  r5                // if (ret) v = r - v;
+        movhs           r7,  r8                // if (ret) dif = dif - vw;
+
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2
+        b               L(renorm2)
+endfunc
+
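+// unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+//
+// decode_bool with cdf[0] as the probability, plus the two-entry cdf
+// update (hedged sketch of the scalar update implemented below):
+//   rate = 4 + (cdf[1] >> 4);
+//   if (bit) cdf[0] += (32768 - cdf[0]) >> rate;
+//   else     cdf[0] -= cdf[0] >> rate;
+//   cdf[1] += (cdf[1] < 32);
+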
+function msac_decode_bool_adapt_neon, export=1
+        push            {r4-r10,lr}
+        ldr             r9,  [r1]              // cdf[0-1]
+        ldr             r5,  [r0, #RNG]
+        movw            lr,  #0xffc0
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48
+        ldr             r7,  [r0, #DIF]
+        lsr             r4,  r5,  #8           // r >> 8
+        and             r2,  r9,  lr           // f &= ~63
+        mul             r4,  r4,  r2
+        mov             r2,  #0
+        lsr             r4,  r4,  #7
+        add             r4,  r4,  #4           // v
+        subs            r8,  r7,  r4, lsl #16  // dif - vw
+        sub             r5,  r5,  r4           // r - v
+        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
+        itee            lo
+        movlo           r2,  #1
+        movhs           r4,  r5                // if (ret) v = r - v;
+        movhs           r7,  r8                // if (ret) dif = dif - vw;
+
+        cmp             r10, #0
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2
+
+        beq             L(renorm2)
+
+        lsr             r2,  r9,  #16          // count = cdf[1]
+        uxth            r9,  r9                // cdf[0]
+
+        sub             r3,  r2,  r2,  lsr #5  // count - (count >= 32)
+        lsr             r2,  r2,  #4           // count >> 4
+        add             r10, r3,  #1           // count + (count < 32)
+        add             r2,  r2,  #4           // rate = (count >> 4) | 4
+
+        sub             r9,  r9,  lr           // cdf[0] -= bit
+        sub             r3,  r9,  lr,  lsl #15 // {cdf[0], cdf[0] - 32769}
+        asr             r3,  r3,  r2           // {cdf[0], cdf[0] - 32769} >> rate
+        sub             r9,  r9,  r3           // cdf[0]
+
+        strh            r9,  [r1]
+        strh            r10, [r1, #2]
+
+        b               L(renorm2)
+endfunc
diff --git a/src/arm/32/util.S b/src/arm/32/util.S
new file mode 100644 (file)
index 0000000..6af0158
--- /dev/null
@@ -0,0 +1,126 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_32_UTIL_S
+#define DAV1D_SRC_ARM_32_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
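+// Load the address of a (possibly external) symbol into rd. In PIC builds
+// this is done PC-relative; the 8 - 4 * CONFIG_THUMB term compensates for
+// the PC read-ahead (PC reads as the current instruction + 8 in ARM mode,
+// + 4 in Thumb mode).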
+.macro movrel_local rd, val, offset=0
+#if defined(PIC)
+        ldr             \rd,  90001f
+        b               90002f
+90001:
+        .word           \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
+90002:
+        add             \rd,  \rd,  pc
+#else
+        movw            \rd, #:lower16:\val+\offset
+        movt            \rd, #:upper16:\val+\offset
+#endif
+.endm
+
+.macro movrel rd, val, offset=0
+#if defined(PIC) && defined(__APPLE__)
+        ldr             \rd,  1f
+        b               2f
+1:
+        .word           3f - (2f + 8 - 4 * CONFIG_THUMB)
+2:
+        ldr             \rd,  [pc, \rd]
+.if \offset < 0
+        sub             \rd,  \rd,  #-(\offset)
+.elseif \offset > 0
+        add             \rd,  \rd,  #\offset
+.endif
+        .non_lazy_symbol_pointer
+3:
+        .indirect_symbol \val
+        .word       0
+        .text
+#else
+        movrel_local    \rd, \val, \offset
+#endif
+.endm
+
+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+        vtrn.32         \q0,  \q2
+        vtrn.32         \q1,  \q3
+
+        vtrn.16         \r0,  \r2
+        vtrn.16         \r1,  \r3
+        vtrn.16         \r4,  \r6
+        vtrn.16         \r5,  \r7
+
+        vtrn.8          \r0,  \r1
+        vtrn.8          \r2,  \r3
+        vtrn.8          \r4,  \r5
+        vtrn.8          \r6,  \r7
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+        vswp            \d0,  \d4
+        vswp            \d1,  \d5
+        vswp            \d2,  \d6
+        vswp            \d3,  \d7
+
+        vtrn.32         \r0,  \r2
+        vtrn.32         \r1,  \r3
+        vtrn.32         \r4,  \r6
+        vtrn.32         \r5,  \r7
+
+        vtrn.16         \r0,  \r1
+        vtrn.16         \r2,  \r3
+        vtrn.16         \r4,  \r5
+        vtrn.16         \r6,  \r7
+.endm
+
+.macro transpose_4x8b q0, q1, r0, r1, r2, r3
+        vtrn.16         \q0,  \q1
+
+        vtrn.8          \r0,  \r1
+        vtrn.8          \r2,  \r3
+.endm
+
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+        vtrn.32         \q0,  \q1
+
+        vtrn.16         \r0,  \r1
+        vtrn.16         \r2,  \r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+        vtrn.32         \r0,  \r2
+        vtrn.32         \r1,  \r3
+
+        vtrn.16         \r0,  \r1
+        vtrn.16         \r2,  \r3
+.endm
+
+#endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/src/arm/64/cdef.S b/src/arm/64/cdef.S
new file mode 100644 (file)
index 0000000..6104470
--- /dev/null
@@ -0,0 +1,517 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        sub             \s1,  \s1,  #2
+        sub             \s2,  \s2,  #2
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             s1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             s3,      [\s2, #\w]
+        uxtl            v0.8h,   v0.8b
+        uxtl            v1.8h,   v1.8b
+        uxtl            v2.8h,   v2.8b
+        uxtl            v3.8h,   v3.8b
+        str             \rw\()0, [x0]
+        str             d1,      [x0, #2*\w]
+        add             x0,  x0,  #2*\stride
+        str             \rw\()2, [x0]
+        str             d3,      [x0, #2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             h1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             h3,      [\s2, #\w]
+        uxtl            v0.8h,   v0.8b
+        uxtl            v1.8h,   v1.8b
+        uxtl            v2.8h,   v2.8b
+        uxtl            v3.8h,   v3.8b
+        str             \rw\()0, [x0]
+        str             s1,      [x0, #2*\w]
+        str             s31,     [x0, #2*\w+4]
+        add             x0,  x0,  #2*\stride
+        str             \rw\()2, [x0]
+        str             s3,      [x0, #2*\w]
+        str             s31,     [x0, #2*\w+4]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             h1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             h3,      [\s2, #\w]
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        uxtl            v2.8h,  v2.8b
+        uxtl            v3.8h,  v3.8b
+        str             s31, [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s1,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31, [x0]
+        stur            \rw\()2, [x0, #4]
+        str             s3,      [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             \rn\()1, [\s2]
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31,     [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+        ld1             {\dst\().s}[0], [\src], \incr
+.else
+        ld1             {\dst\().8b},   [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top, int h,
+//                                    enum CdefEdgeFlags edges);
+
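+// A hedged sketch of what these functions produce for the filters below:
+// tmp is a (w+4)-wide block of uint16_t rows, stride elements apart, with
+// a 2-pixel border around the w x h source block; border positions that
+// fall outside the available edges are filled with 0x8000, a value no
+// real pixel can take, so the filter can recognize and ignore them.
+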
+.macro padding_func w, stride, rn, rw
+function cdef_padding\w\()_8bpc_neon, export=1
+        cmp             w6,  #0xf // fully edged
+        b.eq            cdef_padding\w\()_edged_8bpc_neon
+        movi            v30.8h,  #0x80, lsl #8
+        mov             v31.16b, v30.16b
+        sub             x0,  x0,  #2*(2*\stride+2)
+        tst             w6,  #4 // CDEF_HAVE_TOP
+        b.ne            1f
+        // !CDEF_HAVE_TOP
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        add             x9,  x4,  x2
+        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0
+
+        // Middle section
+3:
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ld1             {v0.h}[0], [x3], #2
+        ldr             h2,      [x1, #\w]
+        load_n_incr     v1,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        uxtl            v2.8h,  v2.8b
+        str             s0,      [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s2,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ld1             {v0.h}[0], [x3], #2
+        load_n_incr     v1,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s0,      [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+        b               3f
+2:
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldr             h1,      [x1, #\w]
+        load_n_incr     v0,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s1,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        load_n_incr     v0,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+
+3:
+        tst             w6,  #8 // CDEF_HAVE_BOTTOM
+        b.ne            1f
+        // !CDEF_HAVE_BOTTOM
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        ret
+1:
+        // CDEF_HAVE_BOTTOM
+        add             x9,  x1,  x2
+        pad_top_bottom  x1,  x9, \w, \stride, \rn, \rw, 1
+endfunc
+.endm
+
+padding_func 8, 16, d, q
+padding_func 4, 8,  s, d
+
+// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top, int h,
+//                                    enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg
+function cdef_padding\w\()_edged_8bpc_neon, export=1
+        sub             x4,  x4,  #2
+        sub             x0,  x0,  #(2*\stride+2)
+
+.if \w == 4
+        ldr             d0, [x4]
+        ldr             d1, [x4, x2]
+        st1             {v0.8b, v1.8b}, [x0], #16
+.else
+        add             x9,  x4,  x2
+        ldr             d0, [x4]
+        ldr             s1, [x4, #8]
+        ldr             d2, [x9]
+        ldr             s3, [x9, #8]
+        str             d0, [x0]
+        str             s1, [x0, #8]
+        str             d2, [x0, #\stride]
+        str             s3, [x0, #\stride+8]
+        add             x0,  x0,  #2*\stride
+.endif
+
+0:
+        ld1             {v0.h}[0], [x3], #2
+        ldr             h2,      [x1, #\w]
+        load_n_incr     v1,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        str             h0,      [x0]
+        stur            \reg\()1, [x0, #2]
+        str             h2,      [x0, #2+\w]
+        add             x0,  x0,  #\stride
+        b.gt            0b
+
+        sub             x1,  x1,  #2
+.if \w == 4
+        ldr             d0, [x1]
+        ldr             d1, [x1, x2]
+        st1             {v0.8b, v1.8b}, [x0], #16
+.else
+        add             x9,  x1,  x2
+        ldr             d0, [x1]
+        ldr             s1, [x1, #8]
+        ldr             d2, [x9]
+        ldr             s3, [x9, #8]
+        str             d0, [x0]
+        str             s1, [x0, #8]
+        str             d2, [x0, #\stride]
+        str             s3, [x0, #\stride+8]
+.endif
+        ret
+endfunc
+.endm
+
+padding_func_edged 8, 16, d
+padding_func_edged 4, 8,  s
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d1, d2, w
+.if \w == 8
+        add             x6,  x2,  w9, sxtb          // x + off
+        sub             x9,  x2,  w9, sxtb          // x - off
+        ld1             {\d1\().d}[0], [x6]         // p0
+        add             x6,  x6,  #16               // += stride
+        ld1             {\d2\().d}[0], [x9]         // p1
+        add             x9,  x9,  #16               // += stride
+        ld1             {\d1\().d}[1], [x6]         // p0
+        ld1             {\d2\().d}[1], [x9]         // p1
+.else
+        add             x6,  x2,  w9, sxtb          // x + off
+        sub             x9,  x2,  w9, sxtb          // x - off
+        ld1             {\d1\().s}[0], [x6]         // p0
+        add             x6,  x6,  #8                // += stride
+        ld1             {\d2\().s}[0], [x9]         // p1
+        add             x9,  x9,  #8                // += stride
+        ld1             {\d1\().s}[1], [x6]         // p0
+        add             x6,  x6,  #8                // += stride
+        ld1             {\d2\().s}[1], [x9]         // p1
+        add             x9,  x9,  #8                // += stride
+        ld1             {\d1\().s}[2], [x6]         // p0
+        add             x6,  x6,  #8                // += stride
+        ld1             {\d2\().s}[2], [x9]         // p1
+        add             x9,  x9,  #8                // += stride
+        ld1             {\d1\().s}[3], [x6]         // p0
+        ld1             {\d2\().s}[3], [x9]         // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+        umin            v3.16b,  v3.16b,  \s1\().16b
+        umax            v4.16b,  v4.16b,  \s1\().16b
+        umin            v3.16b,  v3.16b,  \s2\().16b
+        umax            v4.16b,  v4.16b,  \s2\().16b
+.endif
+        uabd            v16.16b, v0.16b,  \s1\().16b  // abs(diff)
+        uabd            v20.16b, v0.16b,  \s2\().16b  // abs(diff)
+        ushl            v17.16b, v16.16b, \shift      // abs(diff) >> shift
+        ushl            v21.16b, v20.16b, \shift      // abs(diff) >> shift
+        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+        cmhi            v18.16b, v0.16b,  \s1\().16b  // px > p0
+        cmhi            v22.16b, v0.16b,  \s2\().16b  // px > p1
+        umin            v17.16b, v17.16b, v16.16b     // imin(abs(diff), clip)
+        umin            v21.16b, v21.16b, v20.16b     // imin(abs(diff), clip)
+        dup             v19.16b, \tap                 // taps[k]
+        neg             v16.16b, v17.16b              // -imin()
+        neg             v20.16b, v21.16b              // -imin()
+        bsl             v18.16b, v16.16b, v17.16b     // constrain() = apply_sign()
+        bsl             v22.16b, v20.16b, v21.16b     // constrain() = apply_sign()
+        smlal           v1.8h,   v18.8b,  v19.8b      // sum += taps[k] * constrain()
+        smlal           v1.8h,   v22.8b,  v19.8b      // sum += taps[k] * constrain()
+        smlal2          v2.8h,   v18.16b, v19.16b     // sum += taps[k] * constrain()
+        smlal2          v2.8h,   v22.16b, v19.16b     // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
+//                                   const uint8_t *tmp, int pri_strength,
+//                                   int sec_strength, int dir, int damping,
+//                                   int h);
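+//
+// The per-tap clamping matches constrain() in the C code; hedged sketch:
+//   constrain(diff, threshold, shift) =
+//       apply_sign(imin(abs(diff),
+//                       imax(0, threshold - (abs(diff) >> shift))), diff)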
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_8bpc_neon
+.if \pri
+        movrel          x8,  pri_taps
+        and             w9,  w3,  #1
+        add             x8,  x8,  w9, uxtw #1
+.endif
+        movrel          x9,  directions\w
+        add             x5,  x9,  w5, uxtw #1
+        movi            v30.8b,  #7
+        dup             v28.8b,  w6                 // damping
+
+.if \pri
+        dup             v25.16b, w3                 // threshold
+.endif
+.if \sec
+        dup             v27.16b, w4                 // threshold
+.endif
+        trn1            v24.8b,  v25.8b, v27.8b
+        clz             v24.8b,  v24.8b             // clz(threshold)
+        sub             v24.8b,  v30.8b, v24.8b     // ulog2(threshold)
+        uqsub           v24.8b,  v28.8b, v24.8b     // shift = imax(0, damping - ulog2(threshold))
+        neg             v24.8b,  v24.8b             // -shift
+.if \sec
+        dup             v26.16b, v24.b[1]
+.endif
+.if \pri
+        dup             v24.16b, v24.b[0]
+.endif
+
+1:
+.if \w == 8
+        add             x12, x2,  #16
+        ld1             {v0.d}[0], [x2]             // px
+        ld1             {v0.d}[1], [x12]            // px
+.else
+        add             x12, x2,  #1*8
+        add             x13, x2,  #2*8
+        add             x14, x2,  #3*8
+        ld1             {v0.s}[0], [x2]             // px
+        ld1             {v0.s}[1], [x12]            // px
+        ld1             {v0.s}[2], [x13]            // px
+        ld1             {v0.s}[3], [x14]            // px
+.endif
+
+        movi            v1.8h,  #0                  // sum
+        movi            v2.8h,  #0                  // sum
+.if \min
+        mov             v3.16b, v0.16b              // min
+        mov             v4.16b, v0.16b              // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease it for the second round.
+        // This value is also used as the loop counter.
+        mov             w11, #2                     // sec_taps[0]
+
+2:
+.if \pri
+        ldrb            w9,  [x5]                   // off1
+
+        load_px_8       v5,  v6, \w
+.endif
+
+.if \sec
+        add             x5,  x5,  #4                // +2*2
+        ldrb            w9,  [x5]                   // off2
+        load_px_8       v28, v29, \w
+.endif
+
+.if \pri
+        ldrb            w10, [x8]                   // *pri_taps
+
+        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min
+.endif
+
+.if \sec
+        add             x5,  x5,  #8                // +2*4
+        ldrb            w9,  [x5]                   // off3
+        load_px_8       v5,  v6,  \w
+
+        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min
+
+        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min
+
+        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
+.else
+        add             x5,  x5,  #1                // x5 += 1
+.endif
+        subs            w11, w11, #1                // sec_tap-- (value)
+.if \pri
+        add             x8,  x8,  #1                // pri_taps++ (pointer)
+.endif
+        b.ne            2b
+
+        sshr            v5.8h,   v1.8h,   #15       // -(sum < 0)
+        sshr            v6.8h,   v2.8h,   #15       // -(sum < 0)
+        add             v1.8h,   v1.8h,   v5.8h     // sum - (sum < 0)
+        add             v2.8h,   v2.8h,   v6.8h     // sum - (sum < 0)
+        srshr           v1.8h,   v1.8h,   #4        // (8 + sum - (sum < 0)) >> 4
+        srshr           v2.8h,   v2.8h,   #4        // (8 + sum - (sum < 0)) >> 4
+        uaddw           v1.8h,   v1.8h,   v0.8b     // px + (8 + sum ...) >> 4
+        uaddw2          v2.8h,   v2.8h,   v0.16b    // px + (8 + sum ...) >> 4
+        sqxtun          v0.8b,   v1.8h
+        sqxtun2         v0.16b,  v2.8h
+.if \min
+        umin            v0.16b,  v0.16b,  v4.16b
+        umax            v0.16b,  v0.16b,  v3.16b    // iclip(px + .., min, max)
+.endif
+.if \w == 8
+        st1             {v0.d}[0], [x0], x1
+        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
+        subs            w7,  w7,  #2                // h -= 2
+        st1             {v0.d}[1], [x0], x1
+.else
+        st1             {v0.s}[0], [x0], x1
+        add             x2,  x2,  #4*8              // tmp += 4*tmp_stride
+        st1             {v0.s}[1], [x0], x1
+        subs            w7,  w7,  #4                // h -= 4
+        st1             {v0.s}[2], [x0], x1
+        st1             {v0.s}[3], [x0], x1
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             x5,  x5,  #2
+.if \pri
+        sub             x8,  x8,  #2
+.endif
+
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
diff --git a/src/arm/64/cdef16.S b/src/arm/64/cdef16.S
new file mode 100644 (file)
index 0000000..125ecb2
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        sub             \s1,  \s1,  #4
+        sub             \s2,  \s2,  #4
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \reg\()0, [\s1]
+        ldr             d1,       [\s1, #2*\w]
+        ldr             \reg\()2, [\s2]
+        ldr             d3,       [\s2, #2*\w]
+        str             \reg\()0, [x0]
+        str             d1,       [x0, #2*\w]
+        add             x0,  x0,  #2*\stride
+        str             \reg\()2, [x0]
+        str             d3,       [x0, #2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \reg\()0, [\s1]
+        ldr             s1,       [\s1, #2*\w]
+        ldr             \reg\()2, [\s2]
+        ldr             s3,       [\s2, #2*\w]
+        str             \reg\()0, [x0]
+        str             s1,       [x0, #2*\w]
+        str             s31,      [x0, #2*\w+4]
+        add             x0,  x0,  #2*\stride
+        str             \reg\()2, [x0]
+        str             s3,       [x0, #2*\w]
+        str             s31,      [x0, #2*\w+4]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \reg\()0, [\s1]
+        ldr             s1,       [\s1, #2*\w]
+        ldr             \reg\()2, [\s2]
+        ldr             s3,       [\s2, #2*\w]
+        str             s31, [x0]
+        stur            \reg\()0, [x0, #4]
+        str             s1,       [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31, [x0]
+        stur            \reg\()2, [x0, #4]
+        str             s3,       [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \reg\()0, [\s1]
+        ldr             \reg\()1, [\s2]
+        str             s31,      [x0]
+        stur            \reg\()0, [x0, #4]
+        str             s31,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31,      [x0]
+        stur            \reg\()1, [x0, #4]
+        str             s31,      [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr_16 dst, src, incr, w
+.if \w == 4
+        ld1             {\dst\().4h}, [\src], \incr
+.else
+        ld1             {\dst\().8h}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+//                                     ptrdiff_t src_stride, const pixel (*left)[2],
+//                                     const pixel *const top, int h,
+//                                     enum CdefEdgeFlags edges);
+
+.macro padding_func_16 w, stride, reg
+function cdef_padding\w\()_16bpc_neon, export=1
+        movi            v30.8h,  #0x80, lsl #8
+        mov             v31.16b, v30.16b
+        sub             x0,  x0,  #2*(2*\stride+2)
+        tst             w6,  #4 // CDEF_HAVE_TOP
+        b.ne            1f
+        // !CDEF_HAVE_TOP
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        add             x9,  x4,  x2
+        pad_top_bot_16  x4,  x9, \w, \stride, \reg, 0
+
+        // Middle section
+3:
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ld1             {v0.s}[0], [x3], #4
+        ldr             s2,       [x1, #2*\w]
+        load_n_incr_16  v1,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        str             s0,       [x0]
+        stur            \reg\()1, [x0, #4]
+        str             s2,       [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ld1             {v0.s}[0], [x3], #4
+        load_n_incr_16  v1,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        str             s0,       [x0]
+        stur            \reg\()1, [x0, #4]
+        str             s31,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+        b               3f
+2:
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldr             s1,       [x1, #2*\w]
+        load_n_incr_16  v0,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        str             s31,      [x0]
+        stur            \reg\()0, [x0, #4]
+        str             s1,       [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        load_n_incr_16  v0,  x1,  x2,  \w
+        subs            w5,  w5,  #1
+        str             s31,      [x0]
+        stur            \reg\()0, [x0, #4]
+        str             s31,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+
+3:
+        tst             w6,  #8 // CDEF_HAVE_BOTTOM
+        b.ne            1f
+        // !CDEF_HAVE_BOTTOM
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        ret
+1:
+        // CDEF_HAVE_BOTTOM
+        add             x9,  x1,  x2
+        pad_top_bot_16  x1,  x9, \w, \stride, \reg, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q
+padding_func_16 4, 8,  d
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
diff --git a/src/arm/64/cdef_tmpl.S b/src/arm/64/cdef_tmpl.S
new file mode 100644 (file)
index 0000000..e8c7faa
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
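+// Each entry of directions\w holds two byte offsets (dy * stride + dx)
+// into the padded tmp buffer: the tap at distance 1 and the tap at
+// distance 2 along one of the 8 CDEF directions. The first six entries
+// are repeated so dir+2 and dir+6 stay in range without an & 7.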
+.macro dir_table w, stride
+const directions\w
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+        .byte            1 * \stride + 0,  2 * \stride + 0
+        .byte            1 * \stride + 0,  2 * \stride - 1
+// Repeated, to avoid & 7
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+        .byte           4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d1, d2, w
+.if \w == 8
+        add             x6,  x2,  w9, sxtb #1       // x + off
+        sub             x9,  x2,  w9, sxtb #1       // x - off
+        ld1             {\d1\().8h}, [x6]           // p0
+        ld1             {\d2\().8h}, [x9]           // p1
+.else
+        add             x6,  x2,  w9, sxtb #1       // x + off
+        sub             x9,  x2,  w9, sxtb #1       // x - off
+        ld1             {\d1\().4h}, [x6]           // p0
+        add             x6,  x6,  #2*8              // += stride
+        ld1             {\d2\().4h}, [x9]           // p1
+        add             x9,  x9,  #2*8              // += stride
+        ld1             {\d1\().d}[1], [x6]         // p0
+        ld1             {\d2\().d}[1], [x9]         // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+        umin            v2.8h,   v2.8h,  \s1\().8h  // min (0x8000 pad = umax, never picked)
+        smax            v3.8h,   v3.8h,  \s1\().8h  // max (0x8000 pad = smin, never picked)
+        umin            v2.8h,   v2.8h,  \s2\().8h
+        smax            v3.8h,   v3.8h,  \s2\().8h
+.endif
+        uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
+        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
+        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
+        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
+        uqsub           v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+        uqsub           v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+        sub             v18.8h, \s1\().8h,  v0.8h   // diff = p0 - px
+        sub             v22.8h, \s2\().8h,  v0.8h   // diff = p1 - px
+        neg             v16.8h, v17.8h              // -clip
+        neg             v20.8h, v21.8h              // -clip
+        smin            v18.8h, v18.8h, v17.8h      // imin(diff, clip)
+        smin            v22.8h, v22.8h, v21.8h      // imin(diff, clip)
+        dup             v19.8h, \tap                // taps[k]
+        smax            v18.8h, v18.8h, v16.8h      // constrain() = imax(imin(diff, clip), -clip)
+        smax            v22.8h, v22.8h, v20.8h      // constrain() = imax(imin(diff, clip), -clip)
+        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
+        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
+.endm
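+
+// In C terms, each tap update above amounts to (a minimal sketch using the
+// imin/imax helpers named in the inline comments, not a drop-in function):
+//
+//     int clip = imax(0, threshold - (abs(diff) >> shift)); // uqsub floors at 0
+//     int c    = imax(imin(diff, clip), -clip);             // constrain()
+//     sum     += taps[k] * c;                               // the mla pair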
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+//                                   const uint16_t *tmp, int pri_strength,
+//                                   int sec_strength, int dir, int damping,
+//                                   int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+        ldr             w8,  [sp]                   // edges
+        cmp             w8,  #0xf
+        b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
+.endif
+.if \pri
+.if \bpc == 16
+        ldr             w9,  [sp, #8]               // bitdepth_max
+        clz             w9,  w9
+        sub             w9,  w9,  #24               // -bitdepth_min_8
+        neg             w9,  w9                     // bitdepth_min_8
+.endif
+        movrel          x8,  pri_taps
+.if \bpc == 16
+        lsr             w9,  w3,  w9                // pri_strength >> bitdepth_min_8
+        and             w9,  w9,  #1                // (pri_strength >> bitdepth_min_8) & 1
+.else
+        and             w9,  w3,  #1
+.endif
+        add             x8,  x8,  w9, uxtw #1
+.endif
+        movrel          x9,  directions\w
+        add             x5,  x9,  w5, uxtw #1
+        movi            v30.4h,   #15
+        dup             v28.4h,   w6                // damping
+
+.if \pri
+        dup             v25.8h, w3                  // threshold
+.endif
+.if \sec
+        dup             v27.8h, w4                  // threshold
+.endif
+        trn1            v24.4h, v25.4h, v27.4h
+        clz             v24.4h, v24.4h              // clz(threshold)
+        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
+        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
+        neg             v24.4h, v24.4h              // -shift
+.if \sec
+        dup             v26.8h, v24.h[1]
+.endif
+.if \pri
+        dup             v24.8h, v24.h[0]
+.endif
+
+1:
+.if \w == 8
+        ld1             {v0.8h}, [x2]               // px
+.else
+        add             x12, x2,  #2*8
+        ld1             {v0.4h},   [x2]             // px
+        ld1             {v0.d}[1], [x12]            // px
+.endif
+
+        movi            v1.8h,  #0                  // sum
+.if \min
+        mov             v2.16b, v0.16b              // min
+        mov             v3.16b, v0.16b              // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease for the second round.
+        // This is also used as loop counter.
+        mov             w11, #2                     // sec_taps[0]
+
+2:
+.if \pri
+        ldrb            w9,  [x5]                   // off1
+
+        load_px         v4,  v5, \w
+.endif
+
+.if \sec
+        add             x5,  x5,  #4                // +2*2
+        ldrb            w9,  [x5]                   // off2
+        load_px         v6,  v7,  \w
+.endif
+
+.if \pri
+        ldrb            w10, [x8]                   // *pri_taps
+
+        handle_pixel    v4,  v5,  v25.8h, v24.8h, w10, \min
+.endif
+
+.if \sec
+        add             x5,  x5,  #8                // +2*4
+        ldrb            w9,  [x5]                   // off3
+        load_px         v4,  v5,  \w
+
+        handle_pixel    v6,  v7,  v27.8h, v26.8h, w11, \min
+
+        handle_pixel    v4,  v5,  v27.8h, v26.8h, w11, \min
+
+        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
+.else
+        add             x5,  x5,  #1                // x5 += 1
+.endif
+        subs            w11, w11, #1                // sec_tap-- (value)
+.if \pri
+        add             x8,  x8,  #1                // pri_taps++ (pointer)
+.endif
+        b.ne            2b
+
+        sshr            v4.8h,  v1.8h,  #15         // -(sum < 0)
+        add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
+        srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
+        add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
+.if \min
+        smin            v0.8h,  v0.8h,  v3.8h
+        smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+        xtn             v0.8b,  v0.8h
+.endif
+.if \w == 8
+        add             x2,  x2,  #2*16             // tmp += tmp_stride
+        subs            w7,  w7,  #1                // h--
+.if \bpc == 8
+        st1             {v0.8b}, [x0], x1
+.else
+        st1             {v0.8h}, [x0], x1
+.endif
+.else
+.if \bpc == 8
+        st1             {v0.s}[0], [x0], x1
+.else
+        st1             {v0.d}[0], [x0], x1
+.endif
+        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
+        subs            w7,  w7,  #2                // h -= 2
+.if \bpc == 8
+        st1             {v0.s}[1], [x0], x1
+.else
+        st1             {v0.d}[1], [x0], x1
+.endif
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             x5,  x5,  #2
+.if \pri
+        sub             x8,  x8,  #2
+.endif
+
+        b.gt            1b
+        ret
+endfunc
+.endm
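+
+// Taken together, the primary-only variant of the function above follows this
+// scalar model (a hedged sketch for orientation, not the dav1d C reference;
+// constrain() is the clip/min/max sequence from handle_pixel):
+//
+//     int shift = imax(0, damping - ulog2(strength)); // the clz/uqsub above
+//     int sum   = 0;
+//     for (int k = 0; k < 2; k++) {
+//         int off = directions[dir][k];
+//         sum += taps[k] * constrain(tmp[off]  - px, strength, shift);
+//         sum += taps[k] * constrain(tmp[-off] - px, strength, shift);
+//     }
+//     dst = px + ((8 + sum - (sum < 0)) >> 4);        // sshr/srshr rounding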
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+        cbnz            w3,  1f // pri_strength
+        b               cdef_filter\w\()_sec_\bpc\()bpc_neon     // only sec
+1:
+        cbnz            w4,  1f // sec_strength
+        b               cdef_filter\w\()_pri_\bpc\()bpc_neon     // only pri
+1:
+        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
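+
+// The exported entry point above is only a strength-based dispatcher; in C
+// terms (sketch):
+//
+//     if (!pri_strength)      cdef_filter_sec(...);     // only sec
+//     else if (!sec_strength) cdef_filter_pri(...);     // only pri
+//     else                    cdef_filter_pri_sec(...); // both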
+
+const div_table
+        .short         840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact
+        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt d1, d2, s1, s2, s3, s4
+        smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
+        smull2          v23.4s,  \s1\().8h, \s1\().8h
+        smull           v24.4s,  \s2\().4h, \s2\().4h
+        smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
+        smull2          v26.4s,  \s3\().8h, \s3\().8h
+        smull           v27.4s,  \s4\().4h, \s4\().4h
+        mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
+        mla             v22.4s,  v23.4s,  v30.4s
+        mla             v22.4s,  v24.4s,  v31.4s
+        mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
+        mla             v25.4s,  v26.4s,  v30.4s
+        mla             v25.4s,  v27.4s,  v31.4s
+        addv            \d1, v22.4s                   // *cost_ptr
+        addv            \d2, v25.4s                   // *cost_ptr
+.endm
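+
+// Per alternating direction, the macro above evaluates (sketch; alt_fact[11]
+// is 0, so only 11 taps contribute per direction):
+//
+//     for (int m = 0; m < 12; m++)
+//         cost[n] += sum_alt[n][m] * sum_alt[n][m] * alt_fact[m];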
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+        mov             w5,  \s2\().s[0]
+.endif
+        cmp             w4,  w1                       // cost[n] > best_cost
+        csel            w0,  w3,  w0,  gt             // best_dir = n
+        csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
+.ifnb \s2
+        add             w3,  w3,  #1                  // n++
+        cmp             w5,  w1                       // cost[n] > best_cost
+        mov             w4,  \s3\().s[0]
+        csel            w0,  w3,  w0,  gt             // best_dir = n
+        csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
+        add             w3,  w3,  #1                  // n++
+.endif
+.endm
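+
+// find_best is one or two unrolled steps of an argmax over the eight costs
+// (sketch):
+//
+//     if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
+//     n++;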
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+//                                   unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+        str             d8,  [sp, #-0x10]!
+        clz             w3,  w3                       // clz(bitdepth_max)
+        sub             w3,  w3,  #24                 // -bitdepth_min_8
+        dup             v8.8h,   w3
+.endif
+        sub             sp,  sp,  #32 // cost
+        mov             w3,  #8
+.if \bpc == 8
+        movi            v31.16b, #128
+.else
+        movi            v31.8h,  #128
+.endif
+        movi            v30.16b, #0
+        movi            v1.8h,   #0 // v0-v1 sum_diag[0]
+        movi            v3.8h,   #0 // v2-v3 sum_diag[1]
+        movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
+        movi            v7.8h,   #0 // v6-v7 sum_alt[0]
+        movi            v17.8h,  #0 // v16-v17 sum_alt[1]
+        movi            v18.8h,  #0 // v18-v19 sum_alt[2]
+        movi            v19.8h,  #0
+        movi            v21.8h,  #0 // v20-v21 sum_alt[3]
+
+.irpc i, 01234567
+.if \bpc == 8
+        ld1             {v26.8b}, [x0], x1
+        usubl           v26.8h,  v26.8b, v31.8b
+.else
+        ld1             {v26.8h}, [x0], x1
+        ushl            v26.8h,  v26.8h, v8.8h
+        sub             v26.8h,  v26.8h, v31.8h
+.endif
+
+        addv            h25,     v26.8h               // [y]
+        rev64           v27.8h,  v26.8h
+        addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
+        add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
+        ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
+        rev64           v29.4h,  v28.4h               // [-(x >> 1)]
+        ins             v4.h[\i], v25.h[0]            // sum_hv[0]
+
+.if \i == 0
+        mov             v0.16b,  v26.16b              // sum_diag[0]
+        mov             v2.16b,  v27.16b              // sum_diag[1]
+        mov             v6.16b,  v28.16b              // sum_alt[0]
+        mov             v16.16b, v29.16b              // sum_alt[1]
+.else
+        ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
+        ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
+        ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
+        ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
+        add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
+        add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
+        add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
+        add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
+        ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
+        ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
+        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
+        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
+        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
+        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
+        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
+        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
+.endif
+.if \i < 6
+        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
+        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
+        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
+        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
+.else
+        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
+.endif
+.if \i == 0
+        mov             v20.16b, v26.16b              // sum_alt[3]
+.elseif \i == 1
+        add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
+.else
+        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
+        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
+        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
+        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
+.endif
+.endr
+
+        movi            v31.4s,  #105
+
+        smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
+        smlal2          v26.4s,  v4.8h,   v4.8h
+        smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
+        smlal2          v27.4s,  v5.8h,   v5.8h
+        mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
+        mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
+        addv            s4,  v26.4s                   // cost[2]
+        addv            s5,  v27.4s                   // cost[6]
+
+        rev64           v1.8h,   v1.8h
+        rev64           v3.8h,   v3.8h
+        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
+        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]
+
+        str             s4,  [sp, #2*4]               // cost[2]
+        str             s5,  [sp, #6*4]               // cost[6]
+
+        movrel          x4,  div_table
+        ld1             {v31.8h}, [x4]
+
+        smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
+        smull2          v23.4s,  v0.8h,   v0.8h
+        smlal           v22.4s,  v1.4h,   v1.4h
+        smlal2          v23.4s,  v1.8h,   v1.8h
+        smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
+        smull2          v25.4s,  v2.8h,   v2.8h
+        smlal           v24.4s,  v3.4h,   v3.4h
+        smlal2          v25.4s,  v3.8h,   v3.8h
+        uxtl            v30.4s,  v31.4h               // div_table
+        uxtl2           v31.4s,  v31.8h
+        mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
+        mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
+        mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
+        mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
+        addv            s0,  v22.4s                   // cost[0]
+        addv            s2,  v24.4s                   // cost[4]
+
+        movrel          x5,  alt_fact
+        ld1             {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
+
+        str             s0,  [sp, #0*4]               // cost[0]
+        str             s2,  [sp, #4*4]               // cost[4]
+
+        uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
+        uxtl            v30.4s,  v30.4h
+        uxtl            v31.4s,  v31.4h
+
+        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
+        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
+        str             s6,  [sp, #1*4]               // cost[1]
+        str             s16, [sp, #3*4]               // cost[3]
+
+        mov             w0,  #0                       // best_dir
+        mov             w1,  v0.s[0]                  // best_cost
+        mov             w3,  #1                       // n
+
+        str             s18, [sp, #5*4]               // cost[5]
+        str             s20, [sp, #7*4]               // cost[7]
+
+        mov             w4,  v6.s[0]
+
+        find_best       v6,  v4, v16
+        find_best       v16, v2, v18
+        find_best       v18, v5, v20
+        find_best       v20
+
+        eor             w3,  w0,  #4                  // best_dir ^4
+        ldr             w4,  [sp, w3, uxtw #2]
+        sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
+        lsr             w1,  w1,  #10
+        str             w1,  [x2]                     // *var
+
+        add             sp,  sp,  #32
+.if \bpc == 16
+        ldr             d8,  [sp], 0x10
+.endif
+        ret
+endfunc
+.endm
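+
+// The tail above mirrors the C definition of the reported variance (sketch,
+// matching the inline comments):
+//
+//     *var = (best_cost - cost[best_dir ^ 4]) >> 10;
+//     return best_dir;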
diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S
new file mode 100644
index 0000000..e53665a
--- /dev/null
+++ b/src/arm/64/ipred.S
@@ -0,0 +1,2476 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_128_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        movi            v0.16b,  #128
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+4:
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            4b
+        ret
+8:
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            8b
+        ret
+16:
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        movi            v1.16b,  #128
+32:
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        movi            v1.16b,  #128
+        movi            v2.16b,  #128
+        movi            v3.16b,  #128
+64:
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_128_tbl):
+        .hword L(ipred_dc_128_tbl) - 640b
+        .hword L(ipred_dc_128_tbl) - 320b
+        .hword L(ipred_dc_128_tbl) -  16b
+        .hword L(ipred_dc_128_tbl) -   8b
+        .hword L(ipred_dc_128_tbl) -   4b
+endfunc
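+
+// All ipred functions in this file dispatch on the block width with the same
+// idiom seen above: width is a power of two in [4, 64], so clz yields a dense
+// index into a table of backward branch offsets (sketch):
+//
+//     int idx = clz(width) - 25;      // 64 -> 0, 32 -> 1, ..., 4 -> 4
+//     target  = tbl_base - tbl[idx];  // .hword entries store "table - label"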
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_v_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #1
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1             {v0.s}[0],  [x2]
+4:
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8b},  [x2]
+8:
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.16b}, [x2]
+16:
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.16b, v1.16b}, [x2]
+32:
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_v_tbl):
+        .hword L(ipred_v_tbl) - 640b
+        .hword L(ipred_v_tbl) - 320b
+        .hword L(ipred_v_tbl) - 160b
+        .hword L(ipred_v_tbl) -  80b
+        .hword L(ipred_v_tbl) -  40b
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_h_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        sub             x2,  x2,  #4
+        sub             x5,  x5,  w3, uxtw
+        mov             x7,  #-4
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+4:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
+        st1             {v3.s}[0],  [x0], x1
+        st1             {v2.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            4b
+        ret
+8:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
+        st1             {v3.8b},  [x0], x1
+        st1             {v2.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            8b
+        ret
+16:
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
+        st1             {v3.16b}, [x0], x1
+        st1             {v2.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            16b
+        ret
+32:
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        st1             {v3.16b}, [x0], x1
+        st1             {v2.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        st1             {v1.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            32b
+        ret
+64:
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]
+        stp             q2,  q2,  [x6, #32]
+        st1             {v3.16b}, [x0], x1
+        st1             {v2.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        st1             {v1.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_h_tbl):
+        .hword L(ipred_h_tbl) - 64b
+        .hword L(ipred_h_tbl) - 32b
+        .hword L(ipred_h_tbl) - 16b
+        .hword L(ipred_h_tbl) -  8b
+        .hword L(ipred_h_tbl) -  4b
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_top_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #1
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v0.2s},  [x2]
+        uaddlv          h0,      v0.8b
+        rshrn           v0.8b,   v0.8h,   #3
+        dup             v0.8b,   v0.b[0]
+4:
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8b},  [x2]
+        uaddlv          h0,      v0.8b
+        rshrn           v0.8b,   v0.8h,   #3
+        dup             v0.8b,   v0.b[0]
+8:
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        rshrn           v0.8b,   v0.8h,   #4
+        dup             v0.16b,  v0.b[0]
+16:
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.16b, v1.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        add             v2.4h,   v0.4h,   v1.4h
+        rshrn           v2.8b,   v2.8h,   #5
+        dup             v0.16b,  v2.b[0]
+        dup             v1.16b,  v2.b[0]
+32:
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v4.4h,   v0.4h,   v1.4h
+        add             v5.4h,   v2.4h,   v3.4h
+        add             v4.4h,   v4.4h,   v5.4h
+        rshrn           v4.8b,   v4.8h,   #6
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
+        dup             v2.16b,  v4.b[0]
+        dup             v3.16b,  v4.b[0]
+64:
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_top_tbl):
+        .hword L(ipred_dc_top_tbl) - 640b
+        .hword L(ipred_dc_top_tbl) - 320b
+        .hword L(ipred_dc_top_tbl) - 160b
+        .hword L(ipred_dc_top_tbl) -  80b
+        .hword L(ipred_dc_top_tbl) -  40b
+endfunc
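+
+// Each per-width head above computes the DC value as a rounded average of
+// the top row (sketch):
+//
+//     dc = (sum(top, width) + (width >> 1)) >> log2(width); // uaddlv + rshrn
+//
+// For width 4 the same four pixels are loaded twice (ld1r {v0.2s}), doubling
+// the sum, so the shift is one larger and the result is unchanged.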
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw
+        clz             w3,  w3
+        clz             w7,  w4
+        adr             x5,  L(ipred_dc_left_tbl)
+        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
+        sub             w7,  w7,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w7,  [x5, w7, uxtw #1]
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w7, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+
+L(ipred_dc_left_h4):
+        ld1r            {v0.2s},  [x2]
+        uaddlv          h0,      v0.8b
+        rshrn           v0.8b,   v0.8h,   #3
+        dup             v0.16b,  v0.b[0]
+        br              x3
+L(ipred_dc_left_w4):
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            L(ipred_dc_left_w4)
+        ret
+
+L(ipred_dc_left_h8):
+        ld1             {v0.8b},  [x2]
+        uaddlv          h0,      v0.8b
+        rshrn           v0.8b,   v0.8h,   #3
+        dup             v0.16b,  v0.b[0]
+        br              x3
+L(ipred_dc_left_w8):
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            L(ipred_dc_left_w8)
+        ret
+
+L(ipred_dc_left_h16):
+        ld1             {v0.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        rshrn           v0.8b,   v0.8h,   #4
+        dup             v0.16b,  v0.b[0]
+        br              x3
+L(ipred_dc_left_w16):
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            L(ipred_dc_left_w16)
+        ret
+
+L(ipred_dc_left_h32):
+        ld1             {v0.16b, v1.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        add             v0.4h,   v0.4h,   v1.4h
+        rshrn           v0.8b,   v0.8h,   #5
+        dup             v0.16b,  v0.b[0]
+        br              x3
+L(ipred_dc_left_w32):
+        mov             v1.16b,  v0.16b
+1:
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h64):
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v0.4h,   v0.4h,   v1.4h
+        add             v2.4h,   v2.4h,   v3.4h
+        add             v0.4h,   v0.4h,   v2.4h
+        rshrn           v0.8b,   v0.8h,   #6
+        dup             v0.16b,  v0.b[0]
+        br              x3
+L(ipred_dc_left_w64):
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+1:
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_tbl):
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw
+        add             w7,  w3,  w4             // width + height
+        clz             w3,  w3
+        clz             w6,  w4
+        dup             v16.8h, w7               // width + height
+        adr             x5,  L(ipred_dc_tbl)
+        rbit            w7,  w7                  // rbit(width + height)
+        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
+        sub             w6,  w6,  #25
+        clz             w7,  w7                  // ctz(width + height)
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w6,  [x5, w6, uxtw #1]
+        neg             w7,  w7                  // -ctz(width + height)
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w6, uxtw
+        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
+        dup             v17.8h,  w7              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+
+L(ipred_dc_h4):
+        ld1             {v0.s}[0],  [x2], #4
+        ins             v0.s[1], wzr
+        uaddlv          h0,      v0.8b
+        br              x3
+L(ipred_dc_w4):
+        add             x2,  x2,  #1
+        ld1             {v1.s}[0],  [x2]
+        ins             v1.s[1], wzr
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h1,      v1.8b
+        cmp             w4,  #4
+        add             v0.4h,   v0.4h,   v1.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 8/16
+        mov             w16, #(0x3334/2)
+        movk            w16, #(0x5556/2), lsl #16
+        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
+        lsr             w16, w16, w17
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8b,   v0.b[0]
+2:
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v0.s}[0],  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h8):
+        ld1             {v0.8b},  [x2], #8
+        uaddlv          h0,      v0.8b
+        br              x3
+L(ipred_dc_w8):
+        add             x2,  x2,  #1
+        ld1             {v1.8b},  [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h1,      v1.8b
+        cmp             w4,  #8
+        add             v0.4h,   v0.4h,   v1.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #(0x3334/2)
+        mov             w17, #(0x5556/2)
+        csel            w16, w16, w17, eq
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8b,   v0.b[0]
+2:
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h16):
+        ld1             {v0.16b}, [x2], #16
+        uaddlv          h0,      v0.16b
+        br              x3
+L(ipred_dc_w16):
+        add             x2,  x2,  #1
+        ld1             {v1.16b}, [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h1,      v1.16b
+        cmp             w4,  #16
+        add             v0.4h,   v0.4h,   v1.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 4/8/32/64
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #(0x3334/2)
+        mov             w17, #(0x5556/2)
+        csel            w16, w16, w17, eq
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.16b,  v0.b[0]
+2:
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h32):
+        ld1             {v0.16b, v1.16b}, [x2], #32
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        add             v0.4h,   v0.4h,   v1.4h
+        br              x3
+L(ipred_dc_w32):
+        add             x2,  x2,  #1
+        ld1             {v1.16b, v2.16b}, [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h1,      v1.16b
+        uaddlv          h2,      v2.16b
+        cmp             w4,  #32
+        add             v0.4h,   v0.4h,   v1.4h
+        add             v0.4h,   v0.4h,   v2.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 8/16/64
+        cmp             w4,  #8
+        mov             w16, #(0x3334/2)
+        mov             w17, #(0x5556/2)
+        csel            w16, w16, w17, eq
+        dup             v16.4h,  w16
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
+1:
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
+2:
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b}, [x0], x1
+        st1             {v0.16b, v1.16b}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h64):
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+        uaddlv          h0,      v0.16b
+        uaddlv          h1,      v1.16b
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v0.4h,   v0.4h,   v1.4h
+        add             v2.4h,   v2.4h,   v3.4h
+        add             v0.4h,   v0.4h,   v2.4h
+        br              x3
+L(ipred_dc_w64):
+        add             x2,  x2,  #1
+        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h1,      v1.16b
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        uaddlv          h4,      v4.16b
+        add             v1.4h,   v1.4h,   v2.4h
+        add             v3.4h,   v3.4h,   v4.4h
+        cmp             w4,  #64
+        add             v0.4h,   v0.4h,   v1.4h
+        add             v0.4h,   v0.4h,   v3.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 16/32
+        mov             w16, #(0x5556/2)
+        movk            w16, #(0x3334/2), lsl #16
+        lsr             w16, w16, w4
+        dup             v16.4h,  w16
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
+1:
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
+        dup             v2.16b,  v4.b[0]
+        dup             v3.16b,  v4.b[0]
+2:
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_tbl):
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
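+
+// When width != height, width + height is 3 << k or 5 << k rather than a
+// power of two. The paths above therefore shift by ctz(width + height) and
+// divide the remainder by 3 or 5 with a fixed-point multiply; sqdmulh doubles
+// its product, hence the halved constants (sketch; sum covers top and left):
+//
+//     dc = (sum + ((width + height) >> 1)) >> ctz(width + height);
+//     if (rem == 3) dc = (dc * 0x5556) >> 16;  // ~1/3: sqdmulh by 0x5556/2
+//     if (rem == 5) dc = (dc * 0x3334) >> 16;  // ~1/5: sqdmulh by 0x3334/2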
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                            const pixel *const topleft,
+//                            const int width, const int height, const int a,
+//                            const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
+        clz             w9,  w3
+        adr             x5,  L(ipred_paeth_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.16b},  [x2]
+        add             x8,  x2,  #1
+        sub             x2,  x2,  #4
+        sub             x5,  x5,  w9, uxtw
+        mov             x7,  #-4
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v5.4s},  [x8]
+        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
+4:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
+        zip1            v0.2s,   v0.2s,   v1.2s
+        zip1            v2.2s,   v2.2s,   v3.2s
+        uaddw           v16.8h,  v6.8h,   v0.8b
+        uaddw           v17.8h,  v6.8h,   v2.8b
+        sqxtun          v16.8b,  v16.8h           // base
+        sqxtun2         v16.16b, v17.8h
+        zip1            v0.2d,   v0.2d,   v2.2d
+        uabd            v20.16b, v5.16b,  v16.16b // tdiff
+        uabd            v22.16b, v4.16b,  v16.16b // tldiff
+        uabd            v16.16b, v0.16b,  v16.16b // ldiff
+        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
+        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
+        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
+        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
+        st1             {v20.s}[3], [x0], x1
+        st1             {v20.s}[2], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v20.s}[1], [x0], x1
+        st1             {v20.s}[0], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1r            {v5.2d},  [x8]
+        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
+8:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
+        uaddw           v16.8h,  v6.8h,   v0.8b
+        uaddw           v17.8h,  v6.8h,   v1.8b
+        uaddw           v18.8h,  v6.8h,   v2.8b
+        uaddw           v19.8h,  v6.8h,   v3.8b
+        sqxtun          v16.8b,  v16.8h           // base
+        sqxtun2         v16.16b, v17.8h
+        sqxtun          v18.8b,  v18.8h
+        sqxtun2         v18.16b, v19.8h
+        zip1            v2.2d,   v2.2d,   v3.2d
+        zip1            v0.2d,   v0.2d,   v1.2d
+        uabd            v21.16b, v5.16b,  v18.16b // tdiff
+        uabd            v20.16b, v5.16b,  v16.16b
+        uabd            v23.16b, v4.16b,  v18.16b // tldiff
+        uabd            v22.16b, v4.16b,  v16.16b
+        uabd            v17.16b, v2.16b,  v18.16b // ldiff
+        uabd            v16.16b, v0.16b,  v16.16b
+        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
+        umin            v18.16b, v20.16b, v22.16b
+        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
+        cmhs            v20.16b, v22.16b, v20.16b
+        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
+        cmhs            v16.16b, v18.16b, v16.16b
+        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v21.d}[1], [x0], x1
+        st1             {v21.d}[0], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v20.d}[1], [x0], x1
+        st1             {v20.d}[0], [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        ld1             {v5.16b},  [x8], #16
+        mov             w9,  w3
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw
+1:
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
+2:
+        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
+        usubl2          v7.8h,   v5.16b,  v4.16b
+        uaddw           v24.8h,  v6.8h,   v0.8b
+        uaddw           v25.8h,  v7.8h,   v0.8b
+        uaddw           v26.8h,  v6.8h,   v1.8b
+        uaddw           v27.8h,  v7.8h,   v1.8b
+        uaddw           v28.8h,  v6.8h,   v2.8b
+        uaddw           v29.8h,  v7.8h,   v2.8b
+        uaddw           v30.8h,  v6.8h,   v3.8b
+        uaddw           v31.8h,  v7.8h,   v3.8b
+        sqxtun          v17.8b,  v26.8h           // base
+        sqxtun2         v17.16b, v27.8h
+        sqxtun          v16.8b,  v24.8h
+        sqxtun2         v16.16b, v25.8h
+        sqxtun          v19.8b,  v30.8h
+        sqxtun2         v19.16b, v31.8h
+        sqxtun          v18.8b,  v28.8h
+        sqxtun2         v18.16b, v29.8h
+        uabd            v23.16b, v5.16b,  v19.16b // tdiff
+        uabd            v22.16b, v5.16b,  v18.16b
+        uabd            v21.16b, v5.16b,  v17.16b
+        uabd            v20.16b, v5.16b,  v16.16b
+        uabd            v27.16b, v4.16b,  v19.16b // tldiff
+        uabd            v26.16b, v4.16b,  v18.16b
+        uabd            v25.16b, v4.16b,  v17.16b
+        uabd            v24.16b, v4.16b,  v16.16b
+        uabd            v19.16b, v3.16b,  v19.16b // ldiff
+        uabd            v18.16b, v2.16b,  v18.16b
+        uabd            v17.16b, v1.16b,  v17.16b
+        uabd            v16.16b, v0.16b,  v16.16b
+        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
+        umin            v30.16b, v22.16b, v26.16b
+        umin            v29.16b, v21.16b, v25.16b
+        umin            v28.16b, v20.16b, v24.16b
+        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
+        cmhs            v22.16b, v26.16b, v22.16b
+        cmhs            v21.16b, v25.16b, v21.16b
+        cmhs            v20.16b, v24.16b, v20.16b
+        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
+        cmhs            v18.16b, v30.16b, v18.16b
+        cmhs            v17.16b, v29.16b, v17.16b
+        cmhs            v16.16b, v28.16b, v16.16b
+        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v22.16b, v5.16b,  v4.16b
+        bsl             v21.16b, v5.16b,  v4.16b
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
+        bit             v22.16b, v2.16b,  v18.16b
+        bit             v21.16b, v1.16b,  v17.16b
+        bit             v20.16b, v0.16b,  v16.16b
+        subs            w3,  w3,  #16
+        st1             {v23.16b}, [x0],  #16
+        st1             {v22.16b}, [x6],  #16
+        st1             {v21.16b}, [x5],  #16
+        st1             {v20.16b}, [x10], #16
+        b.le            8f
+        ld1             {v5.16b},  [x8], #16
+        b               2b
+8:
+        subs            w4,  w4,  #4
+        b.le            9f
+        // End of horizontal loop, move pointers to next four rows
+        sub             x8,  x8,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        // Load the top row as early as possible
+        ld1             {v5.16b},  [x8], #16
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_paeth_tbl):
+        .hword L(ipred_paeth_tbl) - 640b
+        .hword L(ipred_paeth_tbl) - 320b
+        .hword L(ipred_paeth_tbl) - 160b
+        .hword L(ipred_paeth_tbl) -  80b
+        .hword L(ipred_paeth_tbl) -  40b
+endfunc
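+
+// The vectorized selects above implement the standard Paeth predictor; a
+// scalar sketch with the names used in the inline comments:
+//
+//     int base   = left + top - topleft;  // clamped to pixel range (sqxtun)
+//     int ldiff  = abs(left    - base);
+//     int tdiff  = abs(top     - base);
+//     int tldiff = abs(topleft - base);
+//     pred = (ldiff <= tdiff && ldiff <= tldiff) ? left
+//          : (tdiff <= tldiff)                   ? top : topleft;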
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
+        movrel          x10, X(sm_weights)
+        add             x11, x10, w4, uxtw
+        add             x10, x10, w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_tbl)
+        sub             x12, x2,  w4, uxtw
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.16b},  [x12] // bottom
+        add             x8,  x2,  #1
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        ld1r            {v6.2s}, [x8]             // top
+        ld1r            {v7.2s}, [x10]            // weights_hor
+        dup             v5.16b,  v6.b[3]          // right
+        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+4:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
+        zip1            v0.2s,   v3.2s,   v2.2s
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        shll            v22.8h,  v4.8b,   #8      // bottom*256
+        shll            v23.8h,  v4.8b,   #8
+        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
+        usubl           v1.8h,   v1.8b,   v5.8b
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v18.8h,  v18.8b
+        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v1.8h,   v7.8h
+        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v23.8h,  v6.8h,   v18.8h
+        uhadd           v20.8h,  v20.8h,  v22.8h
+        uhadd           v21.8h,  v21.8h,  v23.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn           v21.8b,  v21.8h,  #8
+        st1             {v20.s}[0], [x0], x1
+        st1             {v20.s}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.s}[0], [x0], x1
+        st1             {v21.s}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        ld1             {v6.8b}, [x8]             // top
+        ld1             {v7.8b}, [x10]            // weights_hor
+        dup             v5.16b,  v6.b[7]          // right
+        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+8:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        shll            v22.8h,  v5.8b,   #8
+        shll            v23.8h,  v5.8b,   #8
+        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
+        usubl           v1.8h,   v1.8b,   v5.8b
+        usubl           v2.8h,   v2.8b,   v5.8b
+        usubl           v3.8h,   v3.8b,   v5.8b
+        shll            v24.8h,  v4.8b,   #8      // bottom*256
+        shll            v25.8h,  v4.8b,   #8
+        shll            v26.8h,  v4.8b,   #8
+        shll            v27.8h,  v4.8b,   #8
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
+        mla             v22.8h,  v1.8h,   v7.8h
+        mla             v23.8h,  v0.8h,   v7.8h
+        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v25.8h,  v6.8h,   v17.8h
+        mla             v26.8h,  v6.8h,   v18.8h
+        mla             v27.8h,  v6.8h,   v19.8h
+        uhadd           v20.8h,  v20.8h,  v24.8h
+        uhadd           v21.8h,  v21.8h,  v25.8h
+        uhadd           v22.8h,  v22.8h,  v26.8h
+        uhadd           v23.8h,  v23.8h,  v27.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn           v21.8b,  v21.8h,  #8
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn           v23.8b,  v23.8h,  #8
+        st1             {v20.8b}, [x0], x1
+        st1             {v21.8b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8b}, [x0], x1
+        st1             {v23.8b}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        add             x12, x2,  w3, uxtw
+        sub             x2,  x2,  #2
+        mov             x7,  #-2
+        ld1r            {v5.16b}, [x12]           // right
+        sub             x1,  x1,  w3, uxtw
+        mov             w9,  w3
+
+1:
+        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
+        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
+        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
+        usubl           v1.8h,   v1.8b,   v5.8b
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+2:
+        ld1             {v7.16b}, [x10],  #16     // weights_hor
+        ld1             {v3.16b}, [x8],   #16     // top
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        shll            v22.8h,  v5.8b,   #8
+        shll            v23.8h,  v5.8b,   #8
+        uxtl            v6.8h,   v7.8b            // weights_hor
+        uxtl2           v7.8h,   v7.16b
+        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
+        usubl2          v3.8h,   v3.16b,  v4.16b
+        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
+        mla             v22.8h,  v0.8h,   v6.8h
+        mla             v23.8h,  v0.8h,   v7.8h
+        shll            v24.8h,  v4.8b,   #8      // bottom*256
+        shll            v25.8h,  v4.8b,   #8
+        shll            v26.8h,  v4.8b,   #8
+        shll            v27.8h,  v4.8b,   #8
+        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v25.8h,  v3.8h,   v16.8h
+        mla             v26.8h,  v2.8h,   v17.8h
+        mla             v27.8h,  v3.8h,   v17.8h
+        uhadd           v20.8h,  v20.8h,  v24.8h
+        uhadd           v21.8h,  v21.8h,  v25.8h
+        uhadd           v22.8h,  v22.8h,  v26.8h
+        uhadd           v23.8h,  v23.8h,  v27.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn2          v20.16b, v21.8h,  #8
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn2          v22.16b, v23.8h,  #8
+        subs            w3,  w3,  #16
+        st1             {v20.16b}, [x0],  #16
+        st1             {v22.16b}, [x6],  #16
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw
+        sub             x10, x10, w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_tbl):
+        .hword L(ipred_smooth_tbl) - 640b
+        .hword L(ipred_smooth_tbl) - 320b
+        .hword L(ipred_smooth_tbl) - 160b
+        .hword L(ipred_smooth_tbl) -  80b
+        .hword L(ipred_smooth_tbl) -  40b
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
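+//
+// Dispatch: clz(width) indexes the .hword table of offsets back from
+// L(ipred_smooth_v_tbl), giving a branchless jump to the width-specialised
+// loop; the same idiom recurs throughout this file.
+// Per-pixel operation, as a scalar sketch (weights_ver is the sm_weights
+// entry for this height; rshrn #8 supplies the +128 rounding and the >>8):
+//   dst[y*stride + x] = (bottom*256 + (top[x] - bottom)*weights_ver[y] + 128) >> 8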
+function ipred_smooth_v_8bpc_neon, export=1
+        movrel          x7,  X(sm_weights)
+        add             x7,  x7,  w4, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_v_tbl)
+        sub             x8,  x2,  w4, uxtw
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.16b},  [x8] // bottom
+        add             x2,  x2,  #1
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v6.2s}, [x2]             // top
+        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
+4:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        shll            v22.8h,  v4.8b,   #8      // bottom*256
+        shll            v23.8h,  v4.8b,   #8
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v18.8h,  v18.8b
+        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v23.8h,  v6.8h,   v18.8h
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn           v23.8b,  v23.8h,  #8
+        st1             {v22.s}[0], [x0], x1
+        st1             {v22.s}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v23.s}[0], [x0], x1
+        st1             {v23.s}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v6.8b}, [x2]             // top
+        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
+8:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        shll            v24.8h,  v4.8b,   #8      // bottom*256
+        shll            v25.8h,  v4.8b,   #8
+        shll            v26.8h,  v4.8b,   #8
+        shll            v27.8h,  v4.8b,   #8
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v25.8h,  v6.8h,   v17.8h
+        mla             v26.8h,  v6.8h,   v18.8h
+        mla             v27.8h,  v6.8h,   v19.8h
+        rshrn           v24.8b,  v24.8h,  #8
+        rshrn           v25.8b,  v25.8h,  #8
+        rshrn           v26.8b,  v26.8h,  #8
+        rshrn           v27.8b,  v27.8h,  #8
+        st1             {v24.8b}, [x0], x1
+        st1             {v25.8b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v26.8b}, [x0], x1
+        st1             {v27.8b}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        // Set up pointers for four rows in parallel; x0, x6, x5, x8
+        add             x5,  x0,  x1
+        add             x8,  x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw
+        mov             w9,  w3
+
+1:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+2:
+        ld1             {v3.16b}, [x2],   #16     // top
+        shll            v20.8h,  v4.8b,   #8      // bottom*256
+        shll            v21.8h,  v4.8b,   #8
+        shll            v22.8h,  v4.8b,   #8
+        shll            v23.8h,  v4.8b,   #8
+        shll            v24.8h,  v4.8b,   #8
+        shll            v25.8h,  v4.8b,   #8
+        shll            v26.8h,  v4.8b,   #8
+        shll            v27.8h,  v4.8b,   #8
+        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
+        usubl2          v3.8h,   v3.16b,  v4.16b
+        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mla             v21.8h,  v3.8h,   v16.8h
+        mla             v22.8h,  v2.8h,   v17.8h
+        mla             v23.8h,  v3.8h,   v17.8h
+        mla             v24.8h,  v2.8h,   v18.8h
+        mla             v25.8h,  v3.8h,   v18.8h
+        mla             v26.8h,  v2.8h,   v19.8h
+        mla             v27.8h,  v3.8h,   v19.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn2          v20.16b, v21.8h,  #8
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn2          v22.16b, v23.8h,  #8
+        rshrn           v24.8b,  v24.8h,  #8
+        rshrn2          v24.16b, v25.8h,  #8
+        rshrn           v26.8b,  v26.8h,  #8
+        rshrn2          v26.16b, v27.8h,  #8
+        subs            w3,  w3,  #16
+        st1             {v20.16b}, [x0],  #16
+        st1             {v22.16b}, [x6],  #16
+        st1             {v24.16b}, [x5],  #16
+        st1             {v26.16b}, [x8],  #16
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x2,  x2,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x8,  x8,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_v_tbl):
+        .hword L(ipred_smooth_v_tbl) - 640b
+        .hword L(ipred_smooth_v_tbl) - 320b
+        .hword L(ipred_smooth_v_tbl) - 160b
+        .hword L(ipred_smooth_v_tbl) -  80b
+        .hword L(ipred_smooth_v_tbl) -  40b
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
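+//
+// Horizontal counterpart, as a scalar sketch (weights_hor is the sm_weights
+// entry for this width; the left column is loaded in reverse order, hence
+// the "(left flipped)" notes in the loops):
+//   dst[y*stride + x] = (right*256 + (left[y] - right)*weights_hor[x] + 128) >> 8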
+function ipred_smooth_h_8bpc_neon, export=1
+        movrel          x8,  X(sm_weights)
+        add             x8,  x8,  w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_h_tbl)
+        add             x12, x2,  w3, uxtw
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v5.16b},  [x12] // right
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v7.2s}, [x8]             // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        uxtl            v7.8h,   v7.8b            // weights_hor
+4:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
+        zip1            v0.2s,   v3.2s,   v2.2s
+        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
+        usubl           v1.8h,   v1.8b,   v5.8b
+        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v1.8h,   v7.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn           v21.8b,  v21.8h,  #8
+        st1             {v20.s}[0], [x0], x1
+        st1             {v20.s}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.s}[0], [x0], x1
+        st1             {v21.s}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v7.8b}, [x8]             // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        uxtl            v7.8h,   v7.8b            // weights_hor
+8:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        shll            v22.8h,  v5.8b,   #8
+        shll            v23.8h,  v5.8b,   #8
+        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
+        usubl           v2.8h,   v2.8b,   v5.8b
+        usubl           v1.8h,   v1.8b,   v5.8b
+        usubl           v0.8h,   v0.8b,   v5.8b
+        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
+        mla             v22.8h,  v1.8h,   v7.8h
+        mla             v23.8h,  v0.8h,   v7.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn           v21.8b,  v21.8h,  #8
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn           v23.8b,  v23.8h,  #8
+        st1             {v20.8b}, [x0], x1
+        st1             {v21.8b}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8b}, [x0], x1
+        st1             {v23.8b}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw
+        mov             w9,  w3
+
+1:
+        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
+        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
+        usubl           v1.8h,   v1.8b,   v5.8b
+        usubl           v2.8h,   v2.8b,   v5.8b
+        usubl           v3.8h,   v3.8b,   v5.8b
+2:
+        ld1             {v7.16b}, [x8],   #16     // weights_hor
+        shll            v20.8h,  v5.8b,   #8      // right*256
+        shll            v21.8h,  v5.8b,   #8
+        shll            v22.8h,  v5.8b,   #8
+        shll            v23.8h,  v5.8b,   #8
+        shll            v24.8h,  v5.8b,   #8
+        shll            v25.8h,  v5.8b,   #8
+        shll            v26.8h,  v5.8b,   #8
+        shll            v27.8h,  v5.8b,   #8
+        uxtl            v6.8h,   v7.8b            // weights_hor
+        uxtl2           v7.8h,   v7.16b
+        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
+        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
+        mla             v22.8h,  v2.8h,   v6.8h
+        mla             v23.8h,  v2.8h,   v7.8h
+        mla             v24.8h,  v1.8h,   v6.8h
+        mla             v25.8h,  v1.8h,   v7.8h
+        mla             v26.8h,  v0.8h,   v6.8h
+        mla             v27.8h,  v0.8h,   v7.8h
+        rshrn           v20.8b,  v20.8h,  #8
+        rshrn2          v20.16b, v21.8h,  #8
+        rshrn           v22.8b,  v22.8h,  #8
+        rshrn2          v22.16b, v23.8h,  #8
+        rshrn           v24.8b,  v24.8h,  #8
+        rshrn2          v24.16b, v25.8h,  #8
+        rshrn           v26.8b,  v26.8h,  #8
+        rshrn2          v26.16b, v27.8h,  #8
+        subs            w3,  w3,  #16
+        st1             {v20.16b}, [x0],  #16
+        st1             {v22.16b}, [x6],  #16
+        st1             {v24.16b}, [x5],  #16
+        st1             {v26.16b}, [x10], #16
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_h_tbl):
+        .hword L(ipred_smooth_h_tbl) - 640b
+        .hword L(ipred_smooth_h_tbl) - 320b
+        .hword L(ipred_smooth_h_tbl) - 160b
+        .hword L(ipred_smooth_h_tbl) -  80b
+        .hword L(ipred_smooth_h_tbl) -  40b
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int filt_idx,
+//                             const int max_width, const int max_height);
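+//
+// Scalar sketch: each 4x2 output block is a 7-tap blend of its neighbours,
+// with v16-v22 holding one filter_intra_taps tap each, for all eight
+// output positions of the block:
+//   acc = filter(0)*p0(topleft) + filter(1..4)*p1..p4(top[0..3])
+//       + filter(5)*p5(left[0])  + filter(6)*p6(left[1])
+//   dst = iclip_pixel((acc + 8) >> 4)        // sqrshrun #4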
+function ipred_filter_8bpc_neon, export=1
+        and             w5,  w5,  #511
+        movrel          x6,  X(filter_intra_taps)
+        lsl             w5,  w5,  #6
+        add             x6,  x6,  w5, uxtw
+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+        clz             w9,  w3
+        adr             x5,  L(ipred_filter_tbl)
+        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x5, w9, uxtw #1]
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        sub             x5,  x5,  w9, uxtw
+        sxtl            v18.8h,  v18.8b
+        sxtl            v19.8h,  v19.8b
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        sxtl            v20.8h,  v20.8b
+        sxtl            v21.8h,  v21.8b
+        sxtl            v22.8h,  v22.8b
+        br              x5
+40:
+        ldur            s0,  [x2, #1]             // top (0-3)
+        sub             x2,  x2,  #2
+        mov             x7,  #-2
+        uxtl            v0.8h,   v0.8b            // top (0-3)
+4:
+        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        sqrshrun        v2.8b,   v2.8h,   #4
+        subs            w4,  w4,  #2
+        st1             {v2.s}[0], [x0], x1
+        uxtl            v0.8h,   v2.8b
+        st1             {v2.s}[1], [x6], x1
+        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
+        b.gt            4b
+        ret
+80:
+        ldur            d0,  [x2, #1]             // top (0-7)
+        sub             x2,  x2,  #2
+        mov             x7,  #-2
+        uxtl            v0.8h,   v0.8b            // top (0-7)
+8:
+        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v2.8b,   v2.8h,   #4
+        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
+        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
+        sqrshrun        v3.8b,   v3.8h,   #4
+        subs            w4,  w4,  #2
+        st2             {v2.s, v3.s}[0], [x0], x1
+        zip2            v0.2s,   v2.2s,   v3.2s   // next top = second output row
+        st2             {v2.s, v3.s}[1], [x6], x1
+        uxtl            v0.8h,   v0.8b
+        b.gt            8b
+        ret
+160:
+320:
+        add             x8,  x2,  #1
+        sub             x2,  x2,  #2
+        mov             x7,  #-2
+        sub             x1,  x1,  w3, uxtw
+        mov             w9,  w3
+
+1:
+        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
+        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
+2:
+        ld1             {v2.16b}, [x8],   #16     // top(0-15)
+        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        uxtl            v1.8h,   v2.8b            // top(0-7)
+        uxtl2           v2.8h,   v2.16b           // top(8-15)
+        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v3.8b,   v3.8h,   #4
+        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
+        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
+        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
+
+        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        sqrshrun        v4.8b,   v4.8h,   #4
+        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
+        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
+        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
+
+        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v5.8b,   v5.8h,   #4
+        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
+        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
+        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        sqrshrun        v6.8b,   v6.8h,   #4
+
+        ins             v0.h[2], v2.h[7]          // next topleft = top[15]
+        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
+        ins             v0.b[0], v6.b[7]          // next left[1] = last output of row 1
+        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
+        ins             v0.b[2], v6.b[3]          // next left[0] = last output of row 0
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x6,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_filter_tbl):
+        .hword L(ipred_filter_tbl) - 320b
+        .hword L(ipred_filter_tbl) - 160b
+        .hword L(ipred_filter_tbl) -  80b
+        .hword L(ipred_filter_tbl) -  40b
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const int w, const int h);
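+//
+// Scalar sketch: a plain palette lookup, dst[i] = pal[idx[i]]; the 16-bit
+// palette is narrowed to bytes once (xtn) so each tbl performs 16 lookups.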
+function pal_pred_8bpc_neon, export=1
+        ld1             {v0.8h}, [x2]
+        clz             w9,  w4
+        adr             x6,  L(pal_pred_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x6, w9, uxtw #1]
+        xtn             v0.8b,  v0.8h
+        sub             x6,  x6,  w9, uxtw
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x6
+4:
+        ld1             {v1.16b}, [x3], #16
+        subs            w5,  w5,  #4
+        tbl             v1.16b, {v0.16b}, v1.16b
+        st1             {v1.s}[0], [x0], x1
+        st1             {v1.s}[1], [x2], x1
+        st1             {v1.s}[2], [x0], x1
+        st1             {v1.s}[3], [x2], x1
+        b.gt            4b
+        ret
+8:
+        ld1             {v1.16b, v2.16b}, [x3], #32
+        subs            w5,  w5,  #4
+        tbl             v1.16b, {v0.16b}, v1.16b
+        st1             {v1.d}[0], [x0], x1
+        tbl             v2.16b, {v0.16b}, v2.16b
+        st1             {v1.d}[1], [x2], x1
+        st1             {v2.d}[0], [x0], x1
+        st1             {v2.d}[1], [x2], x1
+        b.gt            8b
+        ret
+16:
+        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
+        subs            w5,  w5,  #4
+        tbl             v1.16b, {v0.16b}, v1.16b
+        tbl             v2.16b, {v0.16b}, v2.16b
+        st1             {v1.16b}, [x0], x1
+        tbl             v3.16b, {v0.16b}, v3.16b
+        st1             {v2.16b}, [x2], x1
+        tbl             v4.16b, {v0.16b}, v4.16b
+        st1             {v3.16b}, [x0], x1
+        st1             {v4.16b}, [x2], x1
+        b.gt            16b
+        ret
+32:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
+        subs            w5,  w5,  #4
+        tbl             v16.16b, {v0.16b}, v16.16b
+        tbl             v17.16b, {v0.16b}, v17.16b
+        tbl             v18.16b, {v0.16b}, v18.16b
+        tbl             v19.16b, {v0.16b}, v19.16b
+        tbl             v20.16b, {v0.16b}, v20.16b
+        st1             {v16.16b, v17.16b}, [x0], x1
+        tbl             v21.16b, {v0.16b}, v21.16b
+        st1             {v18.16b, v19.16b}, [x2], x1
+        tbl             v22.16b, {v0.16b}, v22.16b
+        st1             {v20.16b, v21.16b}, [x0], x1
+        tbl             v23.16b, {v0.16b}, v23.16b
+        st1             {v22.16b, v23.16b}, [x2], x1
+        b.gt            32b
+        ret
+64:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
+        subs            w5,  w5,  #2
+        tbl             v16.16b, {v0.16b}, v16.16b
+        tbl             v17.16b, {v0.16b}, v17.16b
+        tbl             v18.16b, {v0.16b}, v18.16b
+        tbl             v19.16b, {v0.16b}, v19.16b
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        tbl             v20.16b, {v0.16b}, v20.16b
+        tbl             v21.16b, {v0.16b}, v21.16b
+        tbl             v22.16b, {v0.16b}, v22.16b
+        tbl             v23.16b, {v0.16b}, v23.16b
+        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
+        b.gt            64b
+        ret
+
+L(pal_pred_tbl):
+        .hword L(pal_pred_tbl) - 64b
+        .hword L(pal_pred_tbl) - 32b
+        .hword L(pal_pred_tbl) - 16b
+        .hword L(pal_pred_tbl) -  8b
+        .hword L(pal_pred_tbl) -  4b
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
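+//
+// dc is fixed at 128, half range at 8 bpc. The shared splat loops below
+// compute, as a scalar sketch:
+//   diff   = ac[i] * alpha
+//   dst[i] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff))
+// where adding sign = diff >> 15 before the rounding shift folds the sign
+// handling into a single srshr.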
+function ipred_cfl_128_8bpc_neon, export=1
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_128_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        movi            v0.8h,   #128 // dc
+        dup             v1.8h,   w6   // alpha
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x7
+L(ipred_cfl_splat_w4):
+        ld1             {v2.8h, v3.8h}, [x5], #32
+        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
+        mul             v3.8h,   v3.8h,   v1.8h
+        sshr            v4.8h,   v2.8h,   #15    // sign = diff >> 15
+        sshr            v5.8h,   v3.8h,   #15
+        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
+        add             v3.8h,   v3.8h,   v5.8h
+        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        srshr           v3.8h,   v3.8h,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
+        sqxtun          v3.8b,   v3.8h
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v2.s}[1],  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v3.s}[0],  [x0], x1
+        st1             {v3.s}[1],  [x6], x1
+        b.gt            L(ipred_cfl_splat_w4)
+        ret
+L(ipred_cfl_splat_w8):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
+        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
+        mul             v3.8h,   v3.8h,   v1.8h
+        mul             v4.8h,   v4.8h,   v1.8h
+        mul             v5.8h,   v5.8h,   v1.8h
+        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
+        sshr            v17.8h,  v3.8h,   #15
+        sshr            v18.8h,  v4.8h,   #15
+        sshr            v19.8h,  v5.8h,   #15
+        add             v2.8h,   v2.8h,   v16.8h // diff + sign
+        add             v3.8h,   v3.8h,   v17.8h
+        add             v4.8h,   v4.8h,   v18.8h
+        add             v5.8h,   v5.8h,   v19.8h
+        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        srshr           v3.8h,   v3.8h,   #6
+        srshr           v4.8h,   v4.8h,   #6
+        srshr           v5.8h,   v5.8h,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        add             v4.8h,   v4.8h,   v0.8h
+        add             v5.8h,   v5.8h,   v0.8h
+        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
+        sqxtun          v3.8b,   v3.8h
+        sqxtun          v4.8b,   v4.8h
+        sqxtun          v5.8b,   v5.8h
+        st1             {v2.8b},  [x0], x1
+        st1             {v3.8b},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v4.8b},  [x0], x1
+        st1             {v5.8b},  [x6], x1
+        b.gt            L(ipred_cfl_splat_w8)
+        ret
+L(ipred_cfl_splat_w16):
+        add             x7,  x5,  w3, uxtw #1
+        sub             x1,  x1,  w3, uxtw
+        mov             w9,  w3
+1:
+        ld1             {v2.8h, v3.8h}, [x5], #32
+        ld1             {v4.8h, v5.8h}, [x7], #32
+        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
+        mul             v3.8h,   v3.8h,   v1.8h
+        mul             v4.8h,   v4.8h,   v1.8h
+        mul             v5.8h,   v5.8h,   v1.8h
+        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
+        sshr            v17.8h,  v3.8h,   #15
+        sshr            v18.8h,  v4.8h,   #15
+        sshr            v19.8h,  v5.8h,   #15
+        add             v2.8h,   v2.8h,   v16.8h // diff + sign
+        add             v3.8h,   v3.8h,   v17.8h
+        add             v4.8h,   v4.8h,   v18.8h
+        add             v5.8h,   v5.8h,   v19.8h
+        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        srshr           v3.8h,   v3.8h,   #6
+        srshr           v4.8h,   v4.8h,   #6
+        srshr           v5.8h,   v5.8h,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        add             v4.8h,   v4.8h,   v0.8h
+        add             v5.8h,   v5.8h,   v0.8h
+        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
+        sqxtun          v3.8b,   v3.8h
+        sqxtun          v4.8b,   v4.8h
+        sqxtun          v5.8b,   v5.8h
+        subs            w3,  w3,  #16
+        st1             {v2.8b, v3.8b},  [x0], #16
+        st1             {v4.8b, v5.8b},  [x6], #16
+        b.gt            1b
+        subs            w4,  w4,  #2
+        add             x5,  x5,  w9, uxtw #1
+        add             x7,  x7,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b.gt            1b
+        ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
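+//
+// dc is the rounded average of the width top neighbours, i.e.
+//   dc = (sum(top[0..width-1]) + (width >> 1)) >> log2(width)
+// (for w4 the four pixels are loaded twice via ld1r, so the 8-byte sum
+// still averages correctly with the same shift); prediction then reuses
+// the splat loops above.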
+function ipred_cfl_top_8bpc_neon, export=1
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_top_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        add             x2,  x2,  #1
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x7
+4:
+        ld1r            {v0.2s},  [x2]
+        uaddlv          h0,      v0.8b
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+8:
+        ld1             {v0.8b},  [x2]
+        uaddlv          h0,      v0.8b
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+16:
+        ld1             {v0.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+32:
+        ld1             {v2.16b, v3.16b}, [x2]
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v2.4h,   v2.4h,   v3.4h
+        urshr           v2.4h,   v2.4h,   #5
+        dup             v0.8h,   v2.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+        .hword L(ipred_cfl_top_tbl) - 32b
+        .hword L(ipred_cfl_top_tbl) - 16b
+        .hword L(ipred_cfl_top_tbl) -  8b
+        .hword L(ipred_cfl_top_tbl) -  4b
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha);
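+//
+// As ipred_cfl_top, but dc averages the height left neighbours; the second
+// table lookup (x9) then jumps to the splat loop matching the width.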
+function ipred_cfl_left_8bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw
+        clz             w9,  w3
+        clz             w8,  w4
+        adr             x10, L(ipred_cfl_splat_tbl)
+        adr             x7,  L(ipred_cfl_left_tbl)
+        sub             w9,  w9,  #26
+        sub             w8,  w8,  #26
+        ldrh            w9,  [x10, w9, uxtw #1]
+        ldrh            w8,  [x7,  w8, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        sub             x9,  x10, w9, uxtw
+        sub             x7,  x7,  w8, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x7
+
+L(ipred_cfl_left_h4):
+        ld1r            {v0.2s},  [x2]
+        uaddlv          h0,      v0.8b
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h8):
+        ld1             {v0.8b},  [x2]
+        uaddlv          h0,      v0.8b
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h16):
+        ld1             {v0.16b}, [x2]
+        uaddlv          h0,      v0.16b
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h32):
+        ld1             {v2.16b, v3.16b}, [x2]
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v2.4h,   v2.4h,   v3.4h
+        urshr           v2.4h,   v2.4h,   #5
+        dup             v0.8h,   v2.h[0]
+        br              x9
+
+L(ipred_cfl_left_tbl):
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height,
+//                          const int16_t *ac, const int alpha);
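+//
+// Scalar sketch: dc = (sum(top) + sum(left) + ((w + h) >> 1)) / (w + h).
+// ushl by -ctz(w+h) divides by the power-of-two factor; for rectangular
+// blocks the remaining /3 or /5 is a fixed-point sqdmulh against roughly
+// 0x10000/3 or 0x10000/5 (the 0x5556 and 0x3334 constants below).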
+function ipred_cfl_8bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw
+        add             w8,  w3,  w4             // width + height
+        dup             v1.8h,   w6              // alpha
+        clz             w9,  w3
+        clz             w6,  w4
+        dup             v16.8h, w8               // width + height
+        adr             x7,  L(ipred_cfl_tbl)
+        rbit            w8,  w8                  // rbit(width + height)
+        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
+        sub             w6,  w6,  #26
+        clz             w8,  w8                  // ctz(width + height)
+        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrh            w6,  [x7, w6, uxtw #1]
+        neg             w8,  w8                  // -ctz(width + height)
+        sub             x9,  x7,  w9, uxtw
+        sub             x7,  x7,  w6, uxtw
+        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
+        dup             v17.8h,  w8              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x7
+
+L(ipred_cfl_h4):
+        ld1             {v0.s}[0],  [x2], #4
+        ins             v0.s[1], wzr
+        uaddlv          h0,      v0.8b
+        br              x9
+L(ipred_cfl_w4):
+        add             x2,  x2,  #1
+        ld1             {v2.s}[0],  [x2]
+        ins             v2.s[1], wzr
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h2,      v2.8b
+        cmp             w4,  #4
+        add             v0.4h,   v0.4h,   v2.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 8/16
+        mov             w16, #(0x3334/2)
+        movk            w16, #(0x5556/2), lsl #16
+        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
+        lsr             w16, w16, w17
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+        ld1             {v0.8b},  [x2], #8
+        uaddlv          h0,      v0.8b
+        br              x9
+L(ipred_cfl_w8):
+        add             x2,  x2,  #1
+        ld1             {v2.8b},  [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h2,      v2.8b
+        cmp             w4,  #8
+        add             v0.4h,   v0.4h,   v2.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #(0x3334/2)
+        mov             w17, #(0x5556/2)
+        csel            w16, w16, w17, eq
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+        ld1             {v0.16b}, [x2], #16
+        uaddlv          h0,      v0.16b
+        br              x9
+L(ipred_cfl_w16):
+        add             x2,  x2,  #1
+        ld1             {v2.16b}, [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h2,      v2.16b
+        cmp             w4,  #16
+        add             v0.4h,   v0.4h,   v2.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 4/8/32
+        cmp             w4,  #4
+        mov             w16, #(0x3334/2)
+        mov             w17, #(0x5556/2)
+        csel            w16, w16, w17, eq
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+        ld1             {v2.16b, v3.16b}, [x2], #32
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        add             v0.4h,   v2.4h,   v3.4h
+        br              x9
+L(ipred_cfl_w32):
+        add             x2,  x2,  #1
+        ld1             {v2.16b, v3.16b}, [x2]
+        add             v0.4h,   v0.4h,   v16.4h
+        uaddlv          h2,      v2.16b
+        uaddlv          h3,      v3.16b
+        cmp             w4,  #32
+        add             v0.4h,   v0.4h,   v2.4h
+        add             v0.4h,   v0.4h,   v3.4h
+        ushl            v0.4h,   v0.4h,   v17.4h
+        b.eq            1f
+        // h = 8/16
+        mov             w16, #(0x5556/2)
+        movk            w16, #(0x3334/2), lsl #16
+        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
+        lsr             w16, w16, w17
+        dup             v16.4h,  w16
+        sqdmulh         v0.4h,   v0.4h,   v16.4h
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
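+//
+// Scalar sketch of the 4:2:0 path: each ac sample is a 2x2 luma sum scaled
+// by 2,
+//   ac[x] = (y0[2*x] + y0[2*x + 1] + y1[2*x] + y1[2*x + 1]) << 1
+// edge-padded out to the block size, after which the rounded block average
+// is subtracted so the stored ac block is roughly zero-mean.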
+function ipred_cfl_ac_420_8bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_420_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8b},   [x1],  x2
+        ld1             {v1.8b},   [x10], x2
+        ld1             {v0.d}[1], [x1],  x2
+        ld1             {v1.d}[1], [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v1.8h,   v1.16b
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h}, [x0], #16
+        add             v16.8h,  v16.8h,  v0.8h
+        b.gt            1b
+        trn2            v1.2d,   v0.2d,   v0.2d   // duplicate the last row
+        trn2            v0.2d,   v0.2d,   v0.2d   // for the h_pad loop below
+L(ipred_cfl_ac_420_w4_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        b.gt            2b
+3:
+        // Aggregate the sums
+        add             v0.8h,   v16.8h,  v17.8h
+        uaddlv          s0,  v0.8h                // sum
+        sub             x0,  x0,  w6, uxtw #3
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
+        dup             v4.8h,   v4.h[0]
+6:      // Subtract dc from ac
+        ld1             {v0.8h, v1.8h}, [x0]
+        subs            w6,  w6,  #4
+        sub             v0.8h,   v0.8h,   v4.8h
+        sub             v1.8h,   v1.8h,   v4.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            6b
+        ret
+
+L(ipred_cfl_ac_420_w8):
+        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v1.16b}, [x10], x2
+        ld1             {v2.16b}, [x1],  x2
+        uaddlp          v0.8h,   v0.16b
+        ld1             {v3.16b}, [x10], x2
+        uaddlp          v1.8h,   v1.16b
+        uaddlp          v2.8h,   v2.16b
+        uaddlp          v3.8h,   v3.16b
+        add             v0.8h,   v0.8h,   v1.8h
+        add             v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v2.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        b.gt            1b
+        mov             v0.16b,  v1.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8b},   [x1],  x2
+        ld1             {v1.8b},   [x10], x2
+        ld1             {v0.d}[1], [x1],  x2
+        ld1             {v1.d}[1], [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v1.8h,   v1.16b
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.4h,   v0.h[3]
+        dup             v3.4h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v0.2d
+        subs            w8,  w8,  #2
+        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+        add             v16.4h,  v16.4h,  v0.4h
+        add             v17.4h,  v17.4h,  v1.4h
+        add             v18.4h,  v18.4h,  v2.4h
+        add             v19.4h,  v19.4h,  v3.4h
+        b.gt            1b
+        trn1            v0.2d,   v2.2d,   v3.2d
+        trn1            v1.2d,   v2.2d,   v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        add             v18.8h,  v18.8h,  v0.8h
+        add             v19.8h,  v19.8h,  v1.8h
+        b.gt            2b
+3:
+
+L(ipred_cfl_ac_420_w8_calc_subtract_dc):
+        // Aggregate the sums
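+        // (the 16-bit per-lane sums are widened to 32 bit first; their
+        // cross-vector total would not fit in 16 bits)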
+        add             v0.8h,   v16.8h,  v17.8h
+        add             v2.8h,   v18.8h,  v19.8h
+        uaddlp          v0.4s,   v0.8h
+        uaddlp          v2.4s,   v2.8h
+        add             v0.4s,   v0.4s,   v2.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #4
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
+        dup             v4.8h,   v4.h[0]
+6:      // Subtract dc from ac
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+        subs            w6,  w6,  #4
+        sub             v0.8h,   v0.8h,   v4.8h
+        sub             v1.8h,   v1.8h,   v4.8h
+        sub             v2.8h,   v2.8h,   v4.8h
+        sub             v3.8h,   v3.8h,   v4.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            6b
+        ret
+
+L(ipred_cfl_ac_420_w16):
+        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.16b, v1.16b}, [x1],  x2
+        ld1             {v2.16b, v3.16b}, [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        ld1             {v4.16b, v5.16b}, [x1],  x2
+        uaddlp          v1.8h,   v1.16b
+        ld1             {v6.16b, v7.16b}, [x10], x2
+        uaddlp          v2.8h,   v2.16b
+        uaddlp          v3.8h,   v3.16b
+        uaddlp          v4.8h,   v4.16b
+        uaddlp          v5.8h,   v5.16b
+        uaddlp          v6.8h,   v6.16b
+        uaddlp          v7.8h,   v7.16b
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        add             v5.8h,   v5.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v1.8h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        shl             v3.8h,   v5.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             d1,  [x1,  #16]
+        ld1             {v0.16b}, [x1],  x2
+        ldr             d3,  [x10, #16]
+        ld1             {v2.16b}, [x10], x2
+        uaddlp          v1.4h,   v1.8b
+        ldr             d5,  [x1,  #16]
+        uaddlp          v0.8h,   v0.16b
+        ld1             {v4.16b}, [x1],  x2
+        uaddlp          v3.4h,   v3.8b
+        ldr             d7,  [x10, #16]
+        uaddlp          v2.8h,   v2.16b
+        ld1             {v6.16b}, [x10], x2
+        uaddlp          v5.4h,   v5.8b
+        uaddlp          v4.8h,   v4.16b
+        uaddlp          v7.4h,   v7.8b
+        uaddlp          v6.8h,   v6.16b
+        add             v1.4h,   v1.4h,   v3.4h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v5.4h,   v5.4h,   v7.4h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v1.4h,   v1.4h,   #1
+        shl             v0.8h,   v0.8h,   #1
+        shl             v3.4h,   v5.4h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v2.16b}, [x10], x2
+        ld1             {v4.16b}, [x1],  x2
+        uaddlp          v0.8h,   v0.16b
+        ld1             {v6.16b}, [x10], x2
+        uaddlp          v2.8h,   v2.16b
+        uaddlp          v4.8h,   v4.16b
+        uaddlp          v6.8h,   v6.16b
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v2.8b}, [x10], x2
+        ld1             {v4.8b}, [x1],  x2
+        uaddlp          v0.4h,   v0.8b
+        ld1             {v6.8b}, [x10], x2
+        uaddlp          v2.4h,   v2.8b
+        uaddlp          v4.4h,   v4.8b
+        uaddlp          v6.4h,   v6.8b
+        add             v0.4h,   v0.4h,   v2.4h
+        add             v4.4h,   v4.4h,   v6.4h
+        shl             v0.4h,   v0.4h,   #1
+        shl             v2.4h,   v4.4h,   #1
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v2.h[3]
+        trn1            v0.2d,   v0.2d,   v1.2d
+        trn1            v2.2d,   v2.2d,   v3.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            2b
+3:
+
+        // Double the height and reuse the w8 summing/subtracting
+        lsl             w6,  w6,  #1
+        lsl             w9,  w9,  #1
+        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+        .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                           const ptrdiff_t stride, const int w_pad,
+//                           const int h_pad, const int cw, const int ch);
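+//
+// 4:2:2 variant: horizontal subsampling only, so each ac sample is a luma
+// pair scaled by 4,
+//   ac[x] = (y[2*x] + y[2*x + 1]) << 2
+// and the h_pad/dc-subtraction tails are shared with the 4:2:0 code above.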
+function ipred_cfl_ac_422_8bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
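+        // The above is a computed goto: for cw in {16, 8, 4}, clz(cw) - 27
+        // gives index 0/1/2 into L(ipred_cfl_ac_422_tbl); each .hword entry
+        // stores the distance from the table back to its handler, which
+        // "sub x7, x7, offset" converts into the target address for br.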
+
+L(ipred_cfl_ac_422_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8b},   [x1],  x2
+        ld1             {v0.d}[1], [x10], x2
+        ld1             {v1.8b},   [x1],  x2
+        ld1             {v1.d}[1], [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v1.8h,   v1.16b
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v1.8h,   #2
+        subs            w8,  w8,  #4
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d
+        trn2            v1.2d,   v1.2d,   v1.2d
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v1.16b}, [x10], x2
+        ld1             {v2.16b}, [x1],  x2
+        uaddlp          v0.8h,   v0.16b
+        ld1             {v3.16b}, [x10], x2
+        uaddlp          v1.8h,   v1.16b
+        uaddlp          v2.8h,   v2.16b
+        uaddlp          v3.8h,   v3.16b
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v1.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        shl             v3.8h,   v3.8h,   #2
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8b},   [x1],  x2
+        ld1             {v0.d}[1], [x10], x2
+        ld1             {v2.8b},   [x1],  x2
+        ld1             {v2.d}[1], [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v2.8h,   v2.16b
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v4.4h,   v0.h[3]
+        dup             v5.8h,   v0.h[7]
+        dup             v6.4h,   v2.h[3]
+        dup             v7.8h,   v2.h[7]
+        trn2            v1.2d,   v0.2d,   v5.2d
+        trn1            v0.2d,   v0.2d,   v4.2d
+        trn2            v3.2d,   v2.2d,   v7.2d
+        trn1            v2.2d,   v2.2d,   v6.2d
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.16b, v1.16b}, [x1],  x2
+        ld1             {v2.16b, v3.16b}, [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v1.8h,   v1.16b
+        uaddlp          v2.8h,   v2.16b
+        uaddlp          v3.8h,   v3.16b
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v1.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        shl             v3.8h,   v3.8h,   #2
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             d1,  [x1,  #16]
+        ld1             {v0.16b}, [x1],  x2
+        ldr             d3,  [x10, #16]
+        ld1             {v2.16b}, [x10], x2
+        uaddlp          v1.4h,   v1.8b
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v3.4h,   v3.8b
+        uaddlp          v2.8h,   v2.16b
+        shl             v1.4h,   v1.4h,   #2
+        shl             v0.8h,   v0.8h,   #2
+        shl             v3.4h,   v3.4h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.16b}, [x1],  x2
+        ld1             {v2.16b}, [x10], x2
+        uaddlp          v0.8h,   v0.16b
+        uaddlp          v2.8h,   v2.16b
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8b}, [x1],  x2
+        ld1             {v2.8b}, [x10], x2
+        uaddlp          v0.4h,   v0.8b
+        uaddlp          v2.4h,   v2.8b
+        shl             v0.4h,   v0.4h,   #2
+        shl             v2.4h,   v2.4h,   #2
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v2.h[3]
+        trn1            v0.2d,   v0.2d,   v1.2d
+        trn1            v2.2d,   v2.2d,   v3.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+        .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S
new file mode 100644
index 0000000..5c13949
--- /dev/null
+++ b/src/arm/64/ipred16.S
@@ -0,0 +1,2834 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
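+// 16 bpc variants: pixel is 16-bit (uint16_t), strides remain in bytes,
+// and ipred_dc_128 additionally takes bitdepth_max (0x3ff at 10 bpc,
+// 0xfff at 12 bpc) on the stack to derive its fill value.
+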
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height,
+//                              const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1
+        ldr             w8,  [sp]
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_128_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        dup             v0.8h,   w8
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1, e.g. 512 at 10 bpc
+        br              x5
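+        // Width-based dispatch: clz(width) - 25 indexes the 5-entry table
+        // below (widths 64 down to 4), a pattern shared by the other ipred
+        // functions in this file.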
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        mov             v1.16b,  v0.16b
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_128_tbl):
+        .hword L(ipred_dc_128_tbl) - 640b
+        .hword L(ipred_dc_128_tbl) - 320b
+        .hword L(ipred_dc_128_tbl) - 160b
+        .hword L(ipred_dc_128_tbl) -   8b
+        .hword L(ipred_dc_128_tbl) -   4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_v_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
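+        // Vertical prediction simply replicates the top edge: each width
+        // case loads topleft[1..width] once and stores it to every row.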
+40:
+        ld1             {v0.4h},  [x2]
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8h},  [x2]
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.8h, v1.8h}, [x2]
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        sub             x1,  x1,  #64
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_v_tbl):
+        .hword L(ipred_v_tbl) - 640b
+        .hword L(ipred_v_tbl) - 320b
+        .hword L(ipred_v_tbl) - 160b
+        .hword L(ipred_v_tbl) -  80b
+        .hword L(ipred_v_tbl) -  40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_h_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        sub             x2,  x2,  #8
+        sub             x5,  x5,  w3, uxtw
+        mov             x7,  #-8
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
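+        // The left edge is read four pixels at a time with ld4r while
+        // walking upwards (x7 = -8), so v3 holds the topmost of the four
+        // rows and the rows are stored in v3..v0 order.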
+4:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        st1             {v3.4h},  [x0], x1
+        st1             {v2.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        st1             {v3.8h},  [x0], x1
+        st1             {v2.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+16:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            16b
+        ret
+32:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]
+        stp             q2,  q2,  [x6, #32]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            32b
+        ret
+64:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]
+        stp             q2,  q2,  [x6, #32]
+        stp             q3,  q3,  [x0, #64]
+        stp             q2,  q2,  [x6, #64]
+        stp             q3,  q3,  [x0, #96]
+        stp             q2,  q2,  [x6, #96]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        stp             q1,  q1,  [x0, #64]
+        stp             q0,  q0,  [x6, #64]
+        stp             q1,  q1,  [x0, #96]
+        stp             q0,  q0,  [x6, #96]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_h_tbl):
+        .hword L(ipred_h_tbl) - 64b
+        .hword L(ipred_h_tbl) - 32b
+        .hword L(ipred_h_tbl) - 16b
+        .hword L(ipred_h_tbl) -  8b
+        .hword L(ipred_h_tbl) -  4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
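+//
+// dc_top fills the block with the rounded average of the top edge,
+// dc = (sum(top[0..w-1]) + w/2) >> log2(w), computed below with
+// addv/uaddlv plus a rounding right shift.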
+function ipred_dc_top_16bpc_neon, export=1
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_top_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.4h,   v0.h[0]
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addv            h0,      v0.8h
+        urshr           v2.4h,   v0.4h,   #4
+        dup             v0.8h,   v2.h[0]
+        dup             v1.8h,   v2.h[0]
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #5
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #6
+        sub             x1,  x1,  #64
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_top_tbl):
+        .hword L(ipred_dc_top_tbl) - 640b
+        .hword L(ipred_dc_top_tbl) - 320b
+        .hword L(ipred_dc_top_tbl) - 160b
+        .hword L(ipred_dc_top_tbl) -  80b
+        .hword L(ipred_dc_top_tbl) -  40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw #1
+        clz             w3,  w3
+        clz             w7,  w4
+        adr             x5,  L(ipred_dc_left_tbl)
+        sub             w3,  w3,  #20 // clz(width) - 25, plus 5: the w* entries follow the 5 h* entries
+        sub             w7,  w7,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w7,  [x5, w7, uxtw #1]
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w7, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
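+        // Two-stage dispatch: br x5 enters the h* handler for the block
+        // height, which averages the left edge into a DC value and then
+        // branches via x3 to the w* store loop for the block width.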
+
+L(ipred_dc_left_h4):
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        br              x3
+L(ipred_dc_left_w4):
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            L(ipred_dc_left_w4)
+        ret
+
+L(ipred_dc_left_h8):
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x3
+L(ipred_dc_left_w8):
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            L(ipred_dc_left_w8)
+        ret
+
+L(ipred_dc_left_h16):
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addv            h0,      v0.8h
+        urshr           v2.4h,   v0.4h,   #4
+        dup             v0.8h,   v2.h[0]
+        dup             v1.8h,   v2.h[0]
+        br              x3
+L(ipred_dc_left_w16):
+        mov             v1.16b,  v0.16b
+1:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h32):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlp          v0.4s,   v0.8h
+        addv            s0,      v0.4s
+        rshrn           v4.4h,   v0.4s,   #5
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w32):
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+1:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h64):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #6
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w64):
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64
+1:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_tbl):
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height, const int a,
+//                          const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw #1
+        add             w7,  w3,  w4             // width + height
+        clz             w3,  w3
+        clz             w6,  w4
+        dup             v16.4s, w7               // width + height
+        adr             x5,  L(ipred_dc_tbl)
+        rbit            w7,  w7                  // rbit(width + height)
+        sub             w3,  w3,  #20            // clz(width) - 25, plus 5: w* entries follow the 5 h* entries
+        sub             w6,  w6,  #25
+        clz             w7,  w7                  // ctz(width + height)
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w6,  [x5, w6, uxtw #1]
+        neg             w7,  w7                  // -ctz(width + height)
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w6, uxtw
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w7              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
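+        // Full DC: dc = (sum(left) + sum(top) + (w + h)/2) >> ctz(w + h),
+        // which is exact when w == h. For rectangular blocks w + h is
+        // 3 or 5 times a power of two, so the w* handlers below further
+        // multiply by 0xAAAB (~1/3 in Q17) or 0x6667 (~1/5 in Q17) and
+        // shift right by 17.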
+
+L(ipred_dc_h4):
+        ld1             {v0.4h},  [x2], #8
+        uaddlv          s0,      v0.4h
+        br              x3
+L(ipred_dc_w4):
+        add             x2,  x2,  #2
+        ld1             {v1.4h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s1,      v1.4h
+        cmp             w4,  #4
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16: w + h is 12 or 20, so /3 or /5 remains after the shift
+        cmp             w4,  #16
+        mov             w16, #0x6667             // ~1/5 in Q17 fixed point
+        mov             w17, #0xAAAB             // ~1/3 in Q17 fixed point
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.4h,   v0.h[0]
+2:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h8):
+        ld1             {v0.8h},  [x2], #16
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w8):
+        add             x2,  x2,  #2
+        ld1             {v1.8h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #8
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+2:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h16):
+        ld1             {v0.8h, v1.8h}, [x2], #32
+        addp            v0.8h,   v0.8h,   v1.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w16):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #16
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/8/32/64
+        tst             w4,  #(32+16+8) // eq for h = 4/64; 16 included only to form an encodable mask
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h32):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w32):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #32
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16/64
+        cmp             w4,  #8
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h64):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w64):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        addp            v20.8h,  v20.8h,  v22.8h
+        addp            v1.8h,   v1.8h,   v20.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #64
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 16/32
+        cmp             w4,  #16
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        sub             x1,  x1,  #64
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_tbl):
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
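+//
+// Paeth picks, per pixel, whichever of left/top/topleft is closest to
+// base = left + top - topleft. In C terms, roughly:
+//
+//     const int base = left + top - topleft;
+//     const int ldiff  = abs(left    - base);
+//     const int tdiff  = abs(top     - base);
+//     const int tldiff = abs(topleft - base);
+//     *dst = ldiff <= tdiff && ldiff <= tldiff ? left :
+//            tdiff <= tldiff                   ? top  : topleft;
+//
+// The vector code computes the three distances with sabd and selects
+// with cmge masks plus bsl/bit.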
+function ipred_paeth_16bpc_neon, export=1
+        clz             w9,  w3
+        adr             x5,  L(ipred_paeth_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x2]
+        add             x8,  x2,  #2
+        sub             x2,  x2,  #8
+        sub             x5,  x5,  w9, uxtw
+        mov             x7,  #-8
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v5.2d},  [x8]
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
+        zip1            v0.2d,   v0.2d,   v1.2d
+        zip1            v2.2d,   v2.2d,   v3.2d
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v2.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v23.8h,  v4.8h,   v17.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v2.8h,   v17.8h
+        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
+        umin            v19.8h,  v21.8h,  v23.8h
+        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v23.8h,  v21.8h
+        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v19.8h,  v17.8h
+        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v21.d}[1], [x0], x1
+        st1             {v21.d}[0], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v20.d}[1], [x0], x1
+        st1             {v20.d}[0], [x6], x1
+        b.gt            4b
+        ret
+80:
+160:
+320:
+640:
+        ld1             {v5.8h},  [x8], #16
+        mov             w9,  w3
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+1:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+2:
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v1.8h
+        add             v18.8h,  v6.8h,   v2.8h
+        add             v19.8h,  v6.8h,   v3.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v5.8h,   v18.8h
+        sabd            v23.8h,  v5.8h,   v19.8h
+        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v25.8h,  v4.8h,   v17.8h
+        sabd            v26.8h,  v4.8h,   v18.8h
+        sabd            v27.8h,  v4.8h,   v19.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v1.8h,   v17.8h
+        sabd            v18.8h,  v2.8h,   v18.8h
+        sabd            v19.8h,  v3.8h,   v19.8h
+        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
+        umin            v29.8h,  v21.8h,  v25.8h
+        umin            v30.8h,  v22.8h,  v26.8h
+        umin            v31.8h,  v23.8h,  v27.8h
+        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v25.8h,  v21.8h
+        cmge            v22.8h,  v26.8h,  v22.8h
+        cmge            v23.8h,  v27.8h,  v23.8h
+        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v29.8h,  v17.8h
+        cmge            v18.8h,  v30.8h,  v18.8h
+        cmge            v19.8h,  v31.8h,  v19.8h
+        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v22.16b, v5.16b,  v4.16b
+        bsl             v21.16b, v5.16b,  v4.16b
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
+        bit             v22.16b, v2.16b,  v18.16b
+        bit             v21.16b, v1.16b,  v17.16b
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v23.8h}, [x0], #16
+        st1             {v22.8h}, [x6], #16
+        subs            w3,  w3,  #8
+        st1             {v21.8h}, [x5], #16
+        st1             {v20.8h}, [x10], #16
+        b.le            8f
+        ld1             {v5.8h},  [x8], #16
+        b               2b
+8:
+        subs            w4,  w4,  #4
+        b.le            9f
+        // End of horizontal loop, move pointers to next four rows
+        sub             x8,  x8,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        // Load the top row as early as possible
+        ld1             {v5.8h},  [x8], #16
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_paeth_tbl):
+        .hword L(ipred_paeth_tbl) - 640b
+        .hword L(ipred_paeth_tbl) - 320b
+        .hword L(ipred_paeth_tbl) - 160b
+        .hword L(ipred_paeth_tbl) -  80b
+        .hword L(ipred_paeth_tbl) -  40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
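+//
+// Smooth blends all four edges using the precomputed sm_weights table:
+//
+//     pred = (w_ver * top  + (256 - w_ver) * bottom +
+//             w_hor * left + (256 - w_hor) * right + 256) >> 9
+//
+// rearranged below as ((bottom + right) << 8) plus
+// (top - bottom) * w_ver plus (left - right) * w_hor, with the final
+// rounding and >> 9 done by rshrn.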
+function ipred_smooth_16bpc_neon, export=1
+        movrel          x10, X(sm_weights)
+        add             x11, x10, w4, uxtw
+        add             x10, x10, w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_tbl)
+        sub             x12, x2,  w4, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x12] // bottom
+        add             x8,  x2,  #2
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ld1r            {v6.2d}, [x8]             // top
+        ld1r            {v7.2s}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[3]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v18.8h,  v18.8b
+        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v0.8h,   v7.8h
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v18.4h
+        smlal2          v23.4s,  v6.8h,   v18.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn           v21.4h,  v21.4s,  #9
+        rshrn           v22.4h,  v22.4s,  #9
+        rshrn           v23.4h,  v23.4s,  #9
+        st1             {v20.4h}, [x0], x1
+        st1             {v21.4h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.4h}, [x0], x1
+        st1             {v23.4h}, [x6], x1
+        b.gt            4b
+        ret
+80:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ld1             {v6.8h}, [x8]             // top
+        ld1             {v7.8b}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[7]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
+        smlal           v22.4s,  v2.4h,   v7.4h
+        smlal2          v23.4s,  v2.8h,   v7.8h
+        smlal           v24.4s,  v1.4h,   v7.4h
+        smlal2          v25.4s,  v1.8h,   v7.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v17.4h
+        smlal2          v23.4s,  v6.8h,   v17.8h
+        smlal           v24.4s,  v6.4h,   v18.4h
+        smlal2          v25.4s,  v6.8h,   v18.8h
+        smlal           v26.4s,  v6.4h,   v19.4h
+        smlal2          v27.4s,  v6.8h,   v19.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        add             x12, x2,  w3, uxtw #1
+        sub             x1,  x1,  w3, uxtw #1
+        ld1r            {v5.8h}, [x12]            // right
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        mov             w9,  w3
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+
+1:
+        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
+        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+2:
+        ld1             {v7.16b}, [x10],  #16     // weights_hor
+        ld1             {v2.8h, v3.8h}, [x8], #32 // top
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        uxtl            v6.8h,   v7.8b            // weights_hor
+        uxtl2           v7.8h,   v7.16b
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v24.4s,  v0.4h,   v6.4h
+        smlal2          v25.4s,  v0.8h,   v6.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v2.8h,   v16.8h
+        smlal           v22.4s,  v3.4h,   v16.4h
+        smlal2          v23.4s,  v3.8h,   v16.8h
+        smlal           v24.4s,  v2.4h,   v17.4h
+        smlal2          v25.4s,  v2.8h,   v17.8h
+        smlal           v26.4s,  v3.4h,   v17.4h
+        smlal2          v27.4s,  v3.8h,   v17.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0], #32
+        st1             {v22.8h, v23.8h}, [x6], #32
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw #1
+        sub             x10, x10, w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_tbl):
+        .hword L(ipred_smooth_tbl) - 640b
+        .hword L(ipred_smooth_tbl) - 320b
+        .hword L(ipred_smooth_tbl) - 160b
+        .hword L(ipred_smooth_tbl) -  80b
+        .hword L(ipred_smooth_tbl) -  40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
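+//
+// smooth_v blends the top edge towards the bottom-left neighbour pixel:
+//
+//     pred = bottom + (((top - bottom) * w_ver + 128) >> 8)
+//
+// The weights are pre-shifted left by 7 so that sqrdmulh (doubling,
+// rounding, >> 16) produces exactly the
+// ((top - bottom) * w_ver + 128) >> 8 term.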
+function ipred_smooth_v_16bpc_neon, export=1
+        movrel          x7,  X(sm_weights)
+        add             x7,  x7,  w4, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_v_tbl)
+        sub             x8,  x2,  w4, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x8] // bottom
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v6.2d}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+4:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v18.8h,  v18.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v18.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        st1             {v20.d}[0], [x0], x1
+        st1             {v20.d}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.d}[0], [x0], x1
+        st1             {v21.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v6.8h}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+8:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v17.8h
+        sqrdmulh        v22.8h,  v6.8h,   v18.8h
+        sqrdmulh        v23.8h,  v6.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        // Set up pointers for four rows in parallel; x0, x6, x5, x8
+        add             x5,  x0,  x1
+        add             x8,  x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+2:
+        ld1             {v2.8h, v3.8h}, [x2], #32 // top
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v16.8h
+        sqrdmulh        v22.8h,  v2.8h,   v17.8h
+        sqrdmulh        v23.8h,  v3.8h,   v17.8h
+        sqrdmulh        v24.8h,  v2.8h,   v18.8h
+        sqrdmulh        v25.8h,  v3.8h,   v18.8h
+        sqrdmulh        v26.8h,  v2.8h,   v19.8h
+        sqrdmulh        v27.8h,  v3.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        add             v24.8h,  v24.8h,  v4.8h
+        add             v25.8h,  v25.8h,  v4.8h
+        add             v26.8h,  v26.8h,  v4.8h
+        add             v27.8h,  v27.8h,  v4.8h
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0], #32
+        st1             {v22.8h, v23.8h}, [x6], #32
+        st1             {v24.8h, v25.8h}, [x5], #32
+        st1             {v26.8h, v27.8h}, [x8], #32
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x2,  x2,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x8,  x8,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_v_tbl):
+        .hword L(ipred_smooth_v_tbl) - 640b
+        .hword L(ipred_smooth_v_tbl) - 320b
+        .hword L(ipred_smooth_v_tbl) - 160b
+        .hword L(ipred_smooth_v_tbl) -  80b
+        .hword L(ipred_smooth_v_tbl) -  40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
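+// Rough scalar sketch (not the exact C reference):
+//   right = topleft[width]
+//   dst[y*stride + x] = right +
+//       (((topleft[-(1 + y)] - right) * weights_hor[x] + 128) >> 8)
+// with weights_hor = dav1d_sm_weights + width. The left column is read
+// backwards in groups of four rows, hence the "left, flipped" shuffles.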
+function ipred_smooth_h_16bpc_neon, export=1
+        movrel          x8,  X(sm_weights)
+        add             x8,  x8,  w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_h_tbl)
+        add             x12, x2,  w3, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v5.8h},  [x12] // right
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v7.2s}, [x8]             // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v1.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        st1             {v20.d}[0], [x0], x1
+        st1             {v20.d}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.d}[0], [x0], x1
+        st1             {v21.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v7.8b}, [x8]             // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        sub             v3.8h,   v3.8h,   v5.8h   // left-right
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v0.8h,   v0.8h,   v5.8h
+        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v1.8h,   v7.8h
+        sqrdmulh        v23.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+2:
+        ld1             {v7.16b}, [x8],   #16     // weights_hor
+        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
+        ushll2          v7.8h,   v7.16b,  #7
+        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v2.8h,   v6.8h
+        sqrdmulh        v23.8h,  v2.8h,   v7.8h
+        sqrdmulh        v24.8h,  v1.8h,   v6.8h
+        sqrdmulh        v25.8h,  v1.8h,   v7.8h
+        sqrdmulh        v26.8h,  v0.8h,   v6.8h
+        sqrdmulh        v27.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        add             v24.8h,  v24.8h,  v5.8h
+        add             v25.8h,  v25.8h,  v5.8h
+        add             v26.8h,  v26.8h,  v5.8h
+        add             v27.8h,  v27.8h,  v5.8h
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0],  #32
+        st1             {v22.8h, v23.8h}, [x6],  #32
+        st1             {v24.8h, v25.8h}, [x5],  #32
+        st1             {v26.8h, v27.8h}, [x10], #32
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_h_tbl):
+        .hword L(ipred_smooth_h_tbl) - 640b
+        .hword L(ipred_smooth_h_tbl) - 320b
+        .hword L(ipred_smooth_h_tbl) - 160b
+        .hword L(ipred_smooth_h_tbl) -  80b
+        .hword L(ipred_smooth_h_tbl) -  40b
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int filt_idx,
+//                              const int max_width, const int max_height,
+//                              const int bitdepth_max);
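+// Scalar sketch of the recursive filter predictor implemented by this
+// macro (simplified; the taps come from dav1d_filter_intra_taps):
+//   for each 4x2 sub-block, with p0 = topleft, p1..p4 = top[0..3] and
+//   p5, p6 = left[0], left[1]:
+//     out[i] = clip((p0*f0[i] + p1*f1[i] + ... + p6*f6[i] + 8) >> 4,
+//                   0, bitdepth_max)
+//   and each sub-block's outputs feed the top/left inputs of the blocks to
+//   its right and below. At 10 bpc the sums fit in 16-bit accumulators
+//   (mul/mla); 12 bpc needs 32-bit ones (smull/smlal), hence the two macro
+//   instances and the bitdepth_max dispatch at the end.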
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+        and             w5,  w5,  #511            // filt_idx &= 511
+        movrel          x6,  X(filter_intra_taps)
+        lsl             w5,  w5,  #6
+        add             x6,  x6,  w5, uxtw
+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+        clz             w9,  w3
+        adr             x5,  L(ipred_filter\bpc\()_tbl)
+        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x5, w9, uxtw #1]
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        sub             x5,  x5,  w9, uxtw
+        sxtl            v18.8h,  v18.8b
+        sxtl            v19.8h,  v19.8b
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        sxtl            v20.8h,  v20.8b
+        sxtl            v21.8h,  v21.8b
+        sxtl            v22.8h,  v22.8b
+        dup             v31.8h,  w8
+        movi            v30.8h,  #0
+        br              x5
+40:
+        ldur            d0,  [x2, #2]             // top (0-3)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+4:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+.endif
+        smin            v2.8h,   v2.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st1             {v2.d}[0], [x0], x1
+        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
+        st1             {v2.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ldur            q0,  [x2, #2]             // top (0-7)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+8:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+        smin            v2.8h,   v2.8h,   v31.8h
+        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
+        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        sqrshrun        v3.4h,   v4.4s,   #4
+        sqrshrun2       v3.8h,   v5.4s,   #4
+.endif
+        smin            v3.8h,   v3.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st2             {v2.d, v3.d}[0], [x0], x1
+        zip2            v0.2d,   v2.2d,   v3.2d
+        st2             {v2.d, v3.d}[1], [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+        add             x8,  x2,  #2
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
+2:
+        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        srshr           v4.8h,   v4.8h,   #4
+        smax            v4.8h,   v4.8h,   v30.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        srshr           v5.8h,   v5.8h,   #4
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        srshr           v6.8h,   v6.8h,   #4
+        smax            v6.8h,   v6.8h,   v30.8h
+.else
+        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
+        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v3.4h,   v3.4s,   #4
+        sqrshrun2       v3.8h,   v4.4s,   #4
+        smin            v3.8h,   v3.8h,   v31.8h
+        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
+        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
+        sqrshrun        v4.4h,   v5.4s,   #4
+        sqrshrun2       v4.8h,   v6.4s,   #4
+        smin            v4.8h,   v4.8h,   v31.8h
+        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
+        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v5.4h,   v24.4s,  #4
+        sqrshrun2       v5.8h,   v25.4s,  #4
+        smin            v5.8h,   v5.8h,   v31.8h
+        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
+        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        sqrshrun        v6.4h,   v26.4s,  #4
+        sqrshrun2       v6.8h,   v27.4s,  #4
+.endif
+        smin            v6.8h,   v6.8h,   v31.8h
+
+        ins             v0.h[2], v2.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+        ins             v0.h[0], v6.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+        ins             v0.h[1], v6.h[3]
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x6,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_filter\bpc\()_tbl):
+        .hword L(ipred_filter\bpc\()_tbl) - 320b
+        .hword L(ipred_filter\bpc\()_tbl) - 160b
+        .hword L(ipred_filter\bpc\()_tbl) -  80b
+        .hword L(ipred_filter\bpc\()_tbl) -  40b
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+        ldr             w8,  [sp]
+        cmp             w8,  0x3ff
+        b.le            ipred_filter_10bpc_neon
+        b               ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const uint16_t *const pal, const uint8_t *idx,
+//                          const int w, const int h);
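+// Scalar sketch: dst[y*stride + x] = pal[idx[y*w + x]], with eight 16-bit
+// palette entries. tbl only indexes bytes, so each palette index i is first
+// expanded into the byte-pair indexes 2*i and 2*i+1 (see the "Restructure"
+// comment below).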
+function pal_pred_16bpc_neon, export=1
+        ld1             {v30.8h}, [x2]
+        clz             w9,  w4
+        adr             x6,  L(pal_pred_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x6, w9, uxtw #1]
+        movi            v31.8h,  #1, lsl #8
+        sub             x6,  x6,  w9, uxtw
+        br              x6
+40:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+4:
+        ld1             {v1.16b}, [x3], #16
+        subs            w5,  w5,  #4
+        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+        add             v1.16b,  v1.16b,  v1.16b
+        zip1            v0.16b,  v1.16b,  v1.16b
+        zip2            v1.16b,  v1.16b,  v1.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        st1             {v0.d}[0], [x0], x1
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.d}[1], [x2], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v1.d}[1], [x2], x1
+        b.gt            4b
+        ret
+80:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+8:
+        ld1             {v2.16b, v3.16b}, [x3], #32
+        subs            w5,  w5,  #4
+        add             v2.16b,  v2.16b,  v2.16b
+        add             v3.16b,  v3.16b,  v3.16b
+        zip1            v0.16b,  v2.16b,  v2.16b
+        zip2            v1.16b,  v2.16b,  v2.16b
+        zip1            v2.16b,  v3.16b,  v3.16b
+        zip2            v3.16b,  v3.16b,  v3.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.8h}, [x0], x1
+        tbl             v2.16b, {v30.16b}, v2.16b
+        st1             {v1.8h}, [x2], x1
+        tbl             v3.16b, {v30.16b}, v3.16b
+        st1             {v2.8h}, [x0], x1
+        st1             {v3.8h}, [x2], x1
+        b.gt            8b
+        ret
+160:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+16:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #4
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        st1             {v2.8h, v3.8h}, [x2], x1
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h}, [x0], x1
+        st1             {v6.8h, v7.8h}, [x2], x1
+        b.gt            16b
+        ret
+320:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+32:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #2
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+        b.gt            32b
+        ret
+640:
+        add             x2,  x0,  #64
+64:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #1
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+        b.gt            64b
+        ret
+
+L(pal_pred_tbl):
+        .hword L(pal_pred_tbl) - 640b
+        .hword L(pal_pred_tbl) - 320b
+        .hword L(pal_pred_tbl) - 160b
+        .hword L(pal_pred_tbl) -  80b
+        .hword L(pal_pred_tbl) -  40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
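+// All the cfl predictors funnel into the splat code below; as a rough
+// sketch (not the exact C reference):
+//   diff = ac[x] * alpha
+//   dst[x] = clip(dc + ((diff + (diff >> 31) + 32) >> 6), 0, bitdepth_max)
+// i.e. a rounding shift applied to the magnitude, with the sign restored.
+// For this "128" variant, dc is the mid-grey value (bitdepth_max + 1) >> 1.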
+function ipred_cfl_128_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_128_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        urshr           v0.8h,   v31.8h,  #1
+        dup             v1.8h,   w6   // alpha
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+L(ipred_cfl_splat_w4):
+        ld1             {v4.8h, v5.8h}, [x5], #32
+        subs            w4,  w4,  #4
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.d}[0],  [x0], x1
+        st1             {v2.d}[1],  [x6], x1
+        st1             {v3.d}[0],  [x0], x1
+        st1             {v3.d}[1],  [x6], x1
+        b.gt            L(ipred_cfl_splat_w4)
+        ret
+L(ipred_cfl_splat_w8):
+        ld1             {v4.8h, v5.8h}, [x5], #32
+        subs            w4,  w4,  #2
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h},  [x0], x1
+        st1             {v3.8h},  [x6], x1
+        b.gt            L(ipred_cfl_splat_w8)
+        ret
+L(ipred_cfl_splat_w16):
+        add             x7,  x5,  w3, uxtw #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+1:
+        ld1             {v2.8h, v3.8h}, [x5], #32
+        ld1             {v4.8h, v5.8h}, [x7], #32
+        subs            w3,  w3,  #16
+        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
+        smull2          v17.4s,  v2.8h,   v1.8h
+        smull           v18.4s,  v3.4h,   v1.4h
+        smull2          v19.4s,  v3.8h,   v1.8h
+        smull           v2.4s,   v4.4h,   v1.4h
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v20.4s,  v16.4s,  #31    // sign = diff >> 31
+        sshr            v21.4s,  v17.4s,  #31
+        sshr            v22.4s,  v18.4s,  #31
+        sshr            v23.4s,  v19.4s,  #31
+        sshr            v24.4s,  v2.4s,   #31
+        sshr            v25.4s,  v3.4s,   #31
+        sshr            v26.4s,  v4.4s,   #31
+        sshr            v27.4s,  v5.4s,   #31
+        add             v16.4s,  v16.4s,  v20.4s // diff + sign
+        add             v17.4s,  v17.4s,  v21.4s
+        add             v18.4s,  v18.4s,  v22.4s
+        add             v19.4s,  v19.4s,  v23.4s
+        add             v2.4s,   v2.4s,   v24.4s
+        add             v3.4s,   v3.4s,   v25.4s
+        add             v4.4s,   v4.4s,   v26.4s
+        add             v5.4s,   v5.4s,   v27.4s
+        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v16.8h,  v17.4s,  #6
+        rshrn           v17.4h,  v18.4s,  #6
+        rshrn2          v17.8h,  v19.4s,  #6
+        rshrn           v6.4h,   v2.4s,   #6
+        rshrn2          v6.8h,   v3.4s,   #6
+        rshrn           v7.4h,   v4.4s,   #6
+        rshrn2          v7.8h,   v5.4s,   #6
+        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
+        add             v3.8h,   v17.8h,  v0.8h
+        add             v4.8h,   v6.8h,   v0.8h
+        add             v5.8h,   v7.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v2.8h, v3.8h},  [x0], #32
+        st1             {v4.8h, v5.8h},  [x6], #32
+        b.gt            1b
+        subs            w4,  w4,  #2
+        add             x5,  x5,  w9, uxtw #1
+        add             x7,  x7,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b.gt            1b
+        ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
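+// Same splat core as above; here dc is the rounded average of the top row,
+//   dc = (sum(topleft[1 .. width]) + (width >> 1)) >> log2(width)
+// which, width being a power of two, is just an across-vector add plus a
+// rounding shift.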
+function ipred_cfl_top_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_top_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        add             x2,  x2,  #2
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+4:
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+8:
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+16:
+        ld1             {v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v2.8h,   v3.8h
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+32:
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v0.4h,   v0.4s,   #5
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+        .hword L(ipred_cfl_top_tbl) - 32b
+        .hword L(ipred_cfl_top_tbl) - 16b
+        .hword L(ipred_cfl_top_tbl) -  8b
+        .hword L(ipred_cfl_top_tbl) -  4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height,
+//                                const int16_t *ac, const int alpha,
+//                                const int bitdepth_max);
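+// As cfl_top, but dc averages the left column, topleft[-height .. -1].
+// Width and height tables are looked up separately: x9 gets the address of
+// the matching splat loop, so each height handler can jump straight there.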
+function ipred_cfl_left_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1
+        clz             w9,  w3
+        clz             w8,  w4
+        adr             x10, L(ipred_cfl_splat_tbl)
+        adr             x7,  L(ipred_cfl_left_tbl)
+        sub             w9,  w9,  #26
+        sub             w8,  w8,  #26
+        ldrh            w9,  [x10, w9, uxtw #1]
+        ldrh            w8,  [x7,  w8, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        sub             x9,  x10, w9, uxtw
+        sub             x7,  x7,  w8, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+
+L(ipred_cfl_left_h4):
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h8):
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h16):
+        ld1             {v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v2.8h,   v3.8h
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v0.4h,   v0.4s,   #5
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_tbl):
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                           const pixel *const topleft,
+//                           const int width, const int height,
+//                           const int16_t *ac, const int alpha,
+//                           const int bitdepth_max);
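+// Here dc = (sum(top) + sum(left) + ((width + height) >> 1)) / (width + height).
+// width + height is either a power of two (square blocks) or 2^n * 3 or
+// 2^n * 5: the rounding bias is added up front, the 2^n part is a variable
+// ushl by -ctz(width + height), and the remaining /3 or /5 is a fixed-point
+// multiply, 0xAAAB / 2^17 ~= 1/3 or 0x6667 / 2^17 ~= 1/5 (close enough to
+// exact division for the sums that can occur here).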
+function ipred_cfl_16bpc_neon, export=1
+        dup             v31.8h,  w7              // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1
+        add             w8,  w3,  w4             // width + height
+        dup             v1.8h,   w6              // alpha
+        clz             w9,  w3
+        clz             w6,  w4
+        dup             v16.4s, w8               // width + height
+        adr             x7,  L(ipred_cfl_tbl)
+        rbit            w8,  w8                  // rbit(width + height)
+        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
+        sub             w6,  w6,  #26
+        clz             w8,  w8                  // ctz(width + height)
+        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrh            w6,  [x7, w6, uxtw #1]
+        neg             w8,  w8                  // -ctz(width + height)
+        sub             x9,  x7,  w9, uxtw
+        sub             x7,  x7,  w6, uxtw
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w8              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+
+L(ipred_cfl_h4):
+        ld1             {v0.4h},  [x2], #8
+        uaddlv          s0,      v0.4h
+        br              x9
+L(ipred_cfl_w4):
+        add             x2,  x2,  #2
+        ld1             {v2.4h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s2,      v2.4h
+        cmp             w4,  #4
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16
+        cmp             w4,  #16
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+        ld1             {v0.8h},  [x2], #16
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w8):
+        add             x2,  x2,  #2
+        ld1             {v2.8h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s2,      v2.8h
+        cmp             w4,  #8
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+        ld1             {v2.8h, v3.8h}, [x2], #32
+        addp            v0.8h,   v2.8h,   v3.8h
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w16):
+        add             x2,  x2,  #2
+        ld1             {v2.8h, v3.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v2.8h,   v2.8h,   v3.8h
+        uaddlv          s2,      v2.8h
+        cmp             w4,  #16
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/8/32
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w32):
+        add             x2,  x2,  #2
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        add             v0.4s,   v0.4s,   v16.4s
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v2.8h,   v2.8h,   v4.8h
+        cmp             w4,  #32
+        uaddlv          s2,      v2.8h
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16
+        cmp             w4,  #8
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                            const ptrdiff_t stride, const int w_pad,
+//                            const int h_pad, const int cw, const int ch);
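+// Rough scalar sketch for 4:2:0 (padding rules abbreviated; see the C
+// cfl_ac reference):
+//   ac[y*cw + x] = (ypx[2*y][2*x] + ypx[2*y][2*x + 1] +
+//                   ypx[2*y + 1][2*x] + ypx[2*y + 1][2*x + 1]) << 1
+// with the last real column/row replicated into the padded area, then
+//   ac[i] -= (sum(ac) + (1 << (log2sz - 1))) >> log2sz
+// over the whole cw x ch block.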
+function ipred_cfl_ac_420_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2         // h_pad *= 4 (padding is passed in units of 4 rows)
+        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_420_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h}, [x0], #16
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        b.gt            1b
+        trn2            v1.2d,   v0.2d,   v0.2d
+        trn2            v0.2d,   v0.2d,   v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+        // Aggregate the sums
+        add             v24.4s,  v24.4s,  v25.4s
+        add             v26.4s,  v26.4s,  v27.4s
+        add             v0.4s,   v24.4s,  v26.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #3
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >> log2sz
+        dup             v4.8h,   v4.h[0]
+6:      // Subtract dc from ac
+        ld1             {v0.8h, v1.8h}, [x0]
+        subs            w6,  w6,  #4
+        sub             v0.8h,   v0.8h,   v4.8h
+        sub             v1.8h,   v1.8h,   v4.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            6b
+        ret
+
+L(ipred_cfl_ac_420_w8):
+        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v4.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        mov             v0.16b,  v1.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.4h,   v0.h[3]
+        dup             v3.4h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v0.2d
+        subs            w8,  w8,  #2
+        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw           v25.4s,  v25.4s,  v1.4h
+        uaddw           v26.4s,  v26.4s,  v2.4h
+        uaddw           v27.4s,  v27.4s,  v3.4h
+        b.gt            1b
+        trn1            v0.2d,   v2.2d,   v3.2d
+        trn1            v1.2d,   v2.2d,   v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+
+        // Double the height and reuse the w4 summing/subtracting
+        // (a w8 buffer of h rows has the same memory layout as a
+        // w4 buffer of 2*h rows)
+        lsl             w6,  w6,  #1
+        lsl             w9,  w9,  #1
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v4.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+        add             v2.8h,   v2.8h,   v6.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v18.8h,  v18.8h,  v19.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        add             v16.8h,  v16.8h,  v20.8h
+        add             v18.8h,  v18.8h,  v22.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v2.8h,   #1
+        shl             v2.8h,   v16.8h,  #1
+        shl             v3.8h,   v18.8h,  #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             q2,  [x1,  #32]
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q5,  [x10, #32]
+        ld1             {v3.8h, v4.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v5.8h,   v5.8h,   v5.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        ldr             q18, [x1,  #32]
+        add             v2.4h,   v2.4h,   v5.4h
+        ld1             {v16.8h, v17.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v3.8h
+        ldr             q21, [x10, #32]
+        ld1             {v19.8h, v20.8h}, [x10], x2
+        addp            v18.8h,  v18.8h,  v18.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v21.8h,  v21.8h,  v21.8h
+        addp            v19.8h,  v19.8h,  v20.8h
+        add             v18.4h,  v18.4h,  v21.4h
+        add             v16.8h,  v16.8h,  v19.8h
+        shl             v1.4h,   v2.4h,   #1
+        shl             v0.8h,   v0.8h,   #1
+        shl             v3.4h,   v18.4h,  #1
+        shl             v2.8h,   v16.8h,  #1
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        ld1             {v4.8h}, [x1],  x2
+        ld1             {v6.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v4.8h
+        addp            v2.8h,   v2.8h,   v6.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v3.2d
+        trn1            v0.2d,   v0.2d,   v1.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            2b
+3:
+
+        // Quadruple the height and reuse the w4 summing/subtracting
+        lsl             w6,  w6,  #2
+        lsl             w9,  w9,  #2
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
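+// The .hword tables below store, for each block width, the distance from
+// the table base back to the corresponding entry point. The dispatch code
+// (adr of the table, ldrh to fetch the offset, sub and br) thus forms a
+// compact, position-independent jump table.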
+L(ipred_cfl_ac_420_tbl):
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+        .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                            const ptrdiff_t stride, const int w_pad,
+//                            const int h_pad, const int cw, const int ch);
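+//
+// As a rough scalar sketch (not dav1d's actual C template), the 4:2:2
+// variant stores each horizontal luma pair, scaled so that all cfl_ac
+// variants share one fixed-point range:
+//     ac[y][x] = (ypx[y][2*x] + ypx[y][2*x + 1]) << 2;
+// and then subtracts the rounded average (the DC) of the whole ac[]
+// block from every entry:
+//     log2sz = ctz(cw) + ctz(ch);
+//     dc = (sum + (1 << (log2sz - 1))) >> log2sz;
+//     ac[y][x] -= dc;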
+function ipred_cfl_ac_422_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_422_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d
+        trn2            v1.2d,   v1.2d,   v1.2d
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v4.4h,   v0.h[3]
+        dup             v5.8h,   v0.h[7]
+        dup             v6.4h,   v2.h[3]
+        dup             v7.8h,   v2.h[7]
+        trn2            v1.2d,   v0.2d,   v5.2d
+        trn1            v0.2d,   v0.2d,   v4.2d
+        trn2            v3.2d,   v2.2d,   v7.2d
+        trn1            v2.2d,   v2.2d,   v6.2d
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             q2,  [x1,  #32]
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q6,  [x10, #32]
+        ld1             {v4.8h, v5.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v6.8h,   v6.8h,   v6.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        shl             v1.4h,   v2.4h,   #2
+        shl             v0.8h,   v0.8h,   #2
+        shl             v3.4h,   v6.4h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v0.8h
+        addp            v2.8h,   v2.8h,   v2.8h
+        shl             v0.4h,   v0.4h,   #2
+        shl             v2.4h,   v2.4h,   #2
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v2.h[3]
+        trn1            v0.2d,   v0.2d,   v1.2d
+        trn1            v2.2d,   v2.2d,   v3.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+        .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
new file mode 100644 (file)
index 0000000..245af0e
--- /dev/null
+++ b/src/arm/64/itx.S
@@ -0,0 +1,3288 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
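+// In this 8bpc build, pixel is uint8_t and coef is int16_t; eob is the
+// index of the last nonzero coefficient in scan order, so eob == 0 means
+// that at most the DC coefficient is set.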
+
+// Most of the functions use the following register layout:
+// x0-x3  external parameters
+// x4     function pointer to first transform
+// x5     function pointer to second transform
+// x6     output parameter for helper function
+// x7     input parameter for helper function
+// x8     input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13    pointer to list of eob thresholds
+// x14    return pointer for helper function
+// x15    return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1   multiplication coefficients
+// v2-v7   scratch registers
+// v8-v15  unused
+// v16-v31 inputs/outputs of transforms
+
+// Potential further optimizations, left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+//   transform functions. (The register layout is designed to potentially
+//   allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+//   we know a significant number of inputs are zero. E.g. if the eob
+//   value indicates that only a quarter of the input values are set,
+//   a significant amount of calculation can be skipped for idct16 and
+//   up, at the cost of more code duplication and special casing.
+
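+// The constants below are the AV1 cosine tables at 12-bit precision;
+// e.g. 2896 = round(4096/sqrt(2)), 3784 = round(4096*cos(pi/8)) and
+// 1567 = round(4096*sin(pi/8)).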
+const idct_coeffs, align=4
+        // idct4
+        .short          2896, 2896*8, 1567, 3784
+        // idct8
+        .short          799, 4017, 3406, 2276
+        // idct16
+        .short          401, 4076, 3166, 2598
+        .short          1931, 3612, 3920, 1189
+        // idct32
+        .short          201, 4091, 3035, 2751
+        .short          1751, 3703, 3857, 1380
+        .short          995, 3973, 3513, 2106
+        .short          2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+        .short          101*8, 4095*8, 2967*8, -2824*8
+        .short          1660*8, 3745*8, 3822*8, -1474*8
+        .short          4076, 401, 4017, 799
+        .short          0, 0, 0, 0
+
+        .short          4036*8, -700*8, 2359*8, 3349*8
+        .short          3461*8, -2191*8, 897*8, 3996*8
+        .short          -3166, -2598, -799, -4017
+        .short          0, 0, 0, 0
+
+        .short          501*8, 4065*8, 3229*8, -2520*8
+        .short          2019*8, 3564*8, 3948*8, -1092*8
+        .short          3612, 1931, 2276, 3406
+        .short          0, 0, 0, 0
+
+        .short          4085*8, -301*8, 2675*8, 3102*8
+        .short          3659*8, -1842*8, 1285*8, 3889*8
+        .short          -3920, -1189, -3406, -2276
+        .short          0, 0, 0, 0
+endconst
+
+const iadst4_coeffs, align=4
+        // .h[4-5] hold 3344 twice, so they can also be read as .s[2]
+        .short          1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+        .short          4076, 401, 3612, 1931
+        .short          2598, 3166, 1189, 3920
+        // idct_coeffs
+        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+        .short          4091, 201, 3973, 995
+        .short          3703, 1751, 3290, 2440
+        .short          2751, 3035, 2106, 3513
+        .short          1380, 3857, 601, 4052
+endconst
+
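+// Butterfly multiply helpers: smull_smlal/smull_smlsl compute
+// d = s0*c0 +/- s1*c1 with 32-bit intermediates, and rshrn_sz rounds
+// and narrows the result back to 16 bits, so a pair of them implements
+//     d = (s0*c0 +/- s1*c1 + 2048) >> 12
+// when used with the #12 shift as below.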
+.macro smull_smlal d0, d1, s0, s1, c0, c1, sz
+        smull           \d0\().4s, \s0\().4h, \c0
+        smlal           \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+        smull2          \d1\().4s, \s0\().8h, \c0
+        smlal2          \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz
+        smull           \d0\().4s, \s0\().4h, \c0
+        smlsl           \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+        smull2          \d1\().4s, \s0\().8h, \c0
+        smlsl2          \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro rshrn_sz d0, s0, s1, shift, sz
+        rshrn           \d0\().4h, \s0\().4s, \shift
+.ifc \sz, .8h
+        rshrn2          \d0\().8h, \s1\().4s, \shift
+.endif
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+        sqrdmulh        \r0\sz,  \r0\sz,  \c
+        sqrdmulh        \r1\sz,  \r1\sz,  \c
+        sqrdmulh        \r2\sz,  \r2\sz,  \c
+        sqrdmulh        \r3\sz,  \r3\sz,  \c
+.ifnb \r4
+        sqrdmulh        \r4\sz,  \r4\sz,  \c
+        sqrdmulh        \r5\sz,  \r5\sz,  \c
+        sqrdmulh        \r6\sz,  \r6\sz,  \c
+        sqrdmulh        \r7\sz,  \r7\sz,  \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+        ld1             {\load},  [\src], x1
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #\shiftbits
+.endif
+.ifnb \addsrc
+        uaddw           \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+        sqxtun          \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+        st1             {\store},  [\dst], x1
+.endif
+.endm
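+// The load_add_store sequences below software-pipeline the epilogue:
+// each invocation advances one row through the load -> srshr -> uaddw ->
+// sqxtun -> st1 chain, with empty arguments skipping stages that have
+// no work yet (at the start) or none left (at the end).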
+.macro load_add_store_8x16 dst, src
+        mov             \src, \dst
+        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
+        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
+        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
+        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
+        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
+        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
+        load_add_store  v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
+        load_add_store  v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
+        load_add_store  v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
+        load_add_store  v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
+        load_add_store  v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
+        load_add_store  v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
+        load_add_store  v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
+        load_add_store  v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
+        load_add_store       ,       , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src
+        load_add_store       ,       , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
+        load_add_store       ,       ,      ,       , v31.8h, v5.8b, v4.8b, \dst, \src
+        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+        mov             \src, \dst
+        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
+        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
+        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src, \shiftbits
+        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src, \shiftbits
+        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
+        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
+        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
+        load_add_store       ,       , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits
+        load_add_store       ,       , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,      , v3.8b, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+        mov             \src, \dst
+        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
+        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
+        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
+        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
+        load_add_store       ,       , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+        load_add_store       ,       , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+        load_add_store       ,       ,      ,       , v19.8h, v5.8b, v4.8b, \dst, \src
+        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+        ld1             {\load}[0],  [\src], x1
+.endif
+.ifnb \inssrc
+        ins             \insdst\().d[1],   \inssrc\().d[0]
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #4
+.endif
+.ifnb \load
+        ld1             {\load}[1],  [\src], x1
+.endif
+.ifnb \addsrc
+        uaddw           \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+        st1             {\store}[0],  [\dst], x1
+.endif
+.ifnb \narrowsrc
+        sqxtun          \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+        st1             {\store}[1],  [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+        mov             \src, \dst
+        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
+        load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
+        load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+        load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+        load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
+        load_add_store4     ,    ,    , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src
+        load_add_store4     ,    ,    , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
+        load_add_store4     ,    ,    ,       , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v7.8b, v6.s, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v7.s, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+        mov             \src, \dst
+        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
+        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
+        load_add_store4     ,    ,    , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
+        load_add_store4     ,    ,    , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v3.8b, v2.s, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v3.s, \dst, \src
+.endm
+
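+// DC-only fast path: when eob is 0 (checked via w3), only the DC
+// coefficient can be set, so the whole inverse transform reduces to
+// adding a single constant to every pixel. Each sqrdmulh by 2896*8 is
+// a rounded multiply by 2896/4096 ~= 1/sqrt(2); rectangular blocks
+// (w == 2*h or h == 2*w) get one extra such scale to match the
+// normalization of the rectangular transforms.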
+.macro idct_dc w, h, shift
+        cbnz            w3,  1f
+        mov             w16, #2896*8
+        ld1r            {v16.8h}, [x2]
+        dup             v0.4h,   w16
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
+        strh            wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
+.endif
+.if \shift > 0
+        srshr           v16.8h,  v16.8h,  #\shift
+.endif
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
+        srshr           v16.8h,  v16.8h,  #4
+        mov             w4,  #\h
+        b               idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+        ld1             {v0.s}[0], [x0], x1
+        ld1             {v0.s}[1], [x0], x1
+        ld1             {v1.s}[0], [x0], x1
+        ld1             {v1.s}[1], [x0], x1
+        subs            w4,  w4,  #4
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v0.8h,   v16.8h,  v0.8b
+        sqxtun          v0.8b,   v0.8h
+        uaddw           v1.8h,   v16.8h,  v1.8b
+        st1             {v0.s}[0], [x0], x1
+        sqxtun          v1.8b,   v1.8h
+        st1             {v0.s}[1], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        st1             {v1.s}[1], [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w8_neon
+1:
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        uaddw           v20.8h,  v16.8h, v0.8b
+        ld1             {v3.8b}, [x0], x1
+        sub             x0,  x0,  x1, lsl #2
+        subs            w4,  w4,  #4
+        uaddw           v21.8h,  v16.8h, v1.8b
+        sqxtun          v0.8b,   v20.8h
+        uaddw           v22.8h,  v16.8h, v2.8b
+        sqxtun          v1.8b,   v21.8h
+        uaddw           v23.8h,  v16.8h, v3.8b
+        st1             {v0.8b}, [x0], x1
+        sqxtun          v2.8b,   v22.8h
+        st1             {v1.8b}, [x0], x1
+        sqxtun          v3.8b,   v23.8h
+        st1             {v2.8b}, [x0], x1
+        st1             {v3.8b}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w16_neon
+1:
+        ld1             {v0.16b}, [x0], x1
+        ld1             {v1.16b}, [x0], x1
+        ld1             {v2.16b}, [x0], x1
+        subs            w4,  w4,  #4
+        uaddw           v20.8h,  v16.8h, v0.8b
+        uaddw2          v21.8h,  v16.8h, v0.16b
+        ld1             {v3.16b}, [x0], x1
+        uaddw           v22.8h,  v16.8h, v1.8b
+        uaddw2          v23.8h,  v16.8h, v1.16b
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v24.8h,  v16.8h, v2.8b
+        uaddw2          v25.8h,  v16.8h, v2.16b
+        sqxtun          v0.8b,   v20.8h
+        sqxtun2         v0.16b,  v21.8h
+        uaddw           v26.8h,  v16.8h, v3.8b
+        uaddw2          v27.8h,  v16.8h, v3.16b
+        sqxtun          v1.8b,   v22.8h
+        sqxtun2         v1.16b,  v23.8h
+        sqxtun          v2.8b,   v24.8h
+        sqxtun2         v2.16b,  v25.8h
+        st1             {v0.16b}, [x0], x1
+        sqxtun          v3.8b,   v26.8h
+        sqxtun2         v3.16b,  v27.8h
+        st1             {v1.16b}, [x0], x1
+        st1             {v2.16b}, [x0], x1
+        st1             {v3.16b}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w32_neon
+1:
+        ld1             {v0.16b, v1.16b},  [x0], x1
+        subs            w4,  w4,  #2
+        uaddw           v20.8h,  v16.8h, v0.8b
+        uaddw2          v21.8h,  v16.8h, v0.16b
+        ld1             {v2.16b, v3.16b},  [x0]
+        uaddw           v22.8h,  v16.8h, v1.8b
+        uaddw2          v23.8h,  v16.8h, v1.16b
+        sub             x0,  x0,  x1
+        uaddw           v24.8h,  v16.8h, v2.8b
+        uaddw2          v25.8h,  v16.8h, v2.16b
+        sqxtun          v0.8b,   v20.8h
+        sqxtun2         v0.16b,  v21.8h
+        uaddw           v26.8h,  v16.8h, v3.8b
+        uaddw2          v27.8h,  v16.8h, v3.16b
+        sqxtun          v1.8b,   v22.8h
+        sqxtun2         v1.16b,  v23.8h
+        sqxtun          v2.8b,   v24.8h
+        sqxtun2         v2.16b,  v25.8h
+        st1             {v0.16b, v1.16b},  [x0], x1
+        sqxtun          v3.8b,   v26.8h
+        sqxtun2         v3.16b,  v27.8h
+        st1             {v2.16b, v3.16b},  [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w64_neon
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0]
+        subs            w4,  w4,  #1
+        uaddw           v20.8h,  v16.8h, v0.8b
+        uaddw2          v21.8h,  v16.8h, v0.16b
+        uaddw           v22.8h,  v16.8h, v1.8b
+        uaddw2          v23.8h,  v16.8h, v1.16b
+        uaddw           v24.8h,  v16.8h, v2.8b
+        uaddw2          v25.8h,  v16.8h, v2.16b
+        sqxtun          v0.8b,   v20.8h
+        sqxtun2         v0.16b,  v21.8h
+        uaddw           v26.8h,  v16.8h, v3.8b
+        uaddw2          v27.8h,  v16.8h, v3.16b
+        sqxtun          v1.8b,   v22.8h
+        sqxtun2         v1.16b,  v23.8h
+        sqxtun          v2.8b,   v24.8h
+        sqxtun2         v2.16b,  v25.8h
+        sqxtun          v3.8b,   v26.8h
+        sqxtun2         v3.16b,  v27.8h
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
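+// 4x4 inverse Walsh-Hadamard transform, as used for lossless blocks.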
+.macro iwht4
+        add             v16.4h,  v16.4h,  v17.4h
+        sub             v21.4h,  v18.4h,  v19.4h
+        sub             v20.4h,  v16.4h,  v21.4h
+        sshr            v20.4h,  v20.4h,  #1
+        sub             v18.4h,  v20.4h,  v17.4h
+        sub             v17.4h,  v20.4h,  v19.4h
+        add             v19.4h,  v21.4h,  v18.4h
+        sub             v16.4h,  v16.4h,  v17.4h
+.endm
+
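+// 4-point inverse DCT. With c() denoting the rounded 12-bit multiply
+// implemented by the helper macros above, this computes:
+//     t0 = c((r0 + r2) * 2896)     t1 = c((r0 - r2) * 2896)
+//     t2 = c(r1*1567 - r3*3784)    t3 = c(r1*3784 + r3*1567)
+//     out = { t0+t3, t1+t2, t1-t2, t0-t3 }, with saturating adds/subs.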
+.macro idct_4 r0, r1, r2, r3, sz
+        smull_smlal     v6,  v7,  \r1, \r3, v0.h[3], v0.h[2], \sz
+        smull_smlsl     v4,  v5,  \r1, \r3, v0.h[2], v0.h[3], \sz
+        smull_smlal     v2,  v3,  \r0, \r2, v0.h[0], v0.h[0], \sz
+        rshrn_sz        v6,  v6,  v7,  #12, \sz
+        rshrn_sz        v7,  v4,  v5,  #12, \sz
+        smull_smlsl     v4,  v5,  \r0, \r2, v0.h[0], v0.h[0], \sz
+        rshrn_sz        v2,  v2,  v3,  #12, \sz
+        rshrn_sz        v3,  v4,  v5,  #12, \sz
+        sqadd           \r0\sz,  v2\sz,   v6\sz
+        sqsub           \r3\sz,  v2\sz,   v6\sz
+        sqadd           \r1\sz,  v3\sz,   v7\sz
+        sqsub           \r2\sz,  v3\sz,   v7\sz
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.4h}, [x16]
+        idct_4          v16, v17, v18, v19, .4h
+        ret
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.4h}, [x16]
+        idct_4          v16, v17, v18, v19, .8h
+        ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+        movrel          x16, iadst4_coeffs
+        ld1             {v0.8h}, [x16]
+
+        ssubl           v3.4s,   v16.4h,  v18.4h
+        smull           v4.4s,   v16.4h,  v0.h[0]
+        smlal           v4.4s,   v18.4h,  v0.h[1]
+        smlal           v4.4s,   v19.4h,  v0.h[2]
+        smull           v7.4s,   v17.4h,  v0.h[3]
+        saddw           v3.4s,   v3.4s,   v19.4h
+        smull           v5.4s,   v16.4h,  v0.h[2]
+        smlsl           v5.4s,   v18.4h,  v0.h[0]
+        smlsl           v5.4s,   v19.4h,  v0.h[1]
+
+        add             \o3\().4s, v4.4s,     v5.4s
+        mul             \o2\().4s, v3.4s,     v0.s[2]
+        add             \o0\().4s, v4.4s,     v7.4s
+        add             \o1\().4s, v5.4s,     v7.4s
+        sub             \o3\().4s, \o3\().4s, v7.4s
+
+        rshrn           \o0\().4h, \o0\().4s, #12
+        rshrn           \o2\().4h, \o2\().4s, #12
+        rshrn           \o1\().4h, \o1\().4s, #12
+        rshrn           \o3\().4h, \o3\().4s, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+        iadst_4x4       v16, v17, v18, v19
+        ret
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+        iadst_4x4       v19, v18, v17, v16
+        ret
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3
+        movrel          x16, iadst4_coeffs
+        ld1             {v0.8h}, [x16]
+
+        ssubl           v2.4s,   v16.4h,  v18.4h
+        ssubl2          v3.4s,   v16.8h,  v18.8h
+        smull           v4.4s,   v16.4h,  v0.h[0]
+        smlal           v4.4s,   v18.4h,  v0.h[1]
+        smlal           v4.4s,   v19.4h,  v0.h[2]
+        smull2          v5.4s,   v16.8h,  v0.h[0]
+        smlal2          v5.4s,   v18.8h,  v0.h[1]
+        smlal2          v5.4s,   v19.8h,  v0.h[2]
+        saddw           v2.4s,   v2.4s,   v19.4h
+        saddw2          v3.4s,   v3.4s,   v19.8h
+        smull           v6.4s,   v16.4h,  v0.h[2]
+        smlsl           v6.4s,   v18.4h,  v0.h[0]
+        smlsl           v6.4s,   v19.4h,  v0.h[1]
+        smull2          v7.4s,   v16.8h,  v0.h[2]
+        smlsl2          v7.4s,   v18.8h,  v0.h[0]
+        smlsl2          v7.4s,   v19.8h,  v0.h[1]
+
+        mul             v18.4s,  v2.4s,   v0.s[2]
+        mul             v19.4s,  v3.4s,   v0.s[2]
+
+        smull           v2.4s,   v17.4h,  v0.h[3]
+        smull2          v3.4s,   v17.8h,  v0.h[3]
+
+        add             v16.4s,  v4.4s,   v2.4s // out0
+        add             v17.4s,  v5.4s,   v3.4s
+
+        add             v4.4s,   v4.4s,   v6.4s // -> out3
+        add             v5.4s,   v5.4s,   v7.4s
+
+        add             v6.4s,   v6.4s,   v2.4s // out1
+        add             v7.4s,   v7.4s,   v3.4s
+
+        sub             v4.4s,   v4.4s,   v2.4s // out3
+        sub             v5.4s,   v5.4s,   v3.4s
+
+        rshrn           v18.4h,  v18.4s, #12
+        rshrn2          v18.8h,  v19.4s, #12
+
+        rshrn           \o0\().4h, v16.4s, #12
+        rshrn2          \o0\().8h, v17.4s, #12
+
+.ifc \o2, v17
+        mov             v17.16b,   v18.16b
+.endif
+
+        rshrn           \o1\().4h, v6.4s,  #12
+        rshrn2          \o1\().8h, v7.4s,  #12
+
+        rshrn           \o3\().4h, v4.4s,  #12
+        rshrn2          \o3\().8h, v5.4s,  #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+        iadst_8x4       v16, v17, v18, v19
+        ret
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+        iadst_8x4       v19, v18, v17, v16
+        ret
+endfunc
+
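+// The 4-point identity transform scales by sqrt(2): 5793 = round(4096 *
+// sqrt(2)), and evaluating it as x + x*(5793-4096)/4096 via
+// sqrdmulh/sqadd keeps the intermediates within 16-bit range.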
+function inv_identity_4h_x4_neon, export=1
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        sqrdmulh        v4.4h,   v16.4h,  v0.h[0]
+        sqrdmulh        v5.4h,   v17.4h,  v0.h[0]
+        sqrdmulh        v6.4h,   v18.4h,  v0.h[0]
+        sqrdmulh        v7.4h,   v19.4h,  v0.h[0]
+        sqadd           v16.4h,  v16.4h,  v4.4h
+        sqadd           v17.4h,  v17.4h,  v5.4h
+        sqadd           v18.4h,  v18.4h,  v6.4h
+        sqadd           v19.4h,  v19.4h,  v7.4h
+        ret
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        sqrdmulh        v4.8h,   v16.8h,  v0.h[0]
+        sqrdmulh        v5.8h,   v17.8h,  v0.h[0]
+        sqrdmulh        v6.8h,   v18.8h,  v0.h[0]
+        sqrdmulh        v7.8h,   v19.8h,  v0.h[0]
+        sqadd           v16.8h,  v16.8h,  v4.8h
+        sqadd           v17.8h,  v17.8h,  v5.8h
+        sqadd           v18.8h,  v18.8h,  v6.8h
+        sqadd           v19.8h,  v19.8h,  v7.8h
+        ret
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+        sqrdmulh        v2.8h,  \i,  \c
+        srhadd          \i,     \i,  v2.8h
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+        mov             x15, x30
+        movi            v31.8h,  #0
+        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+        st1             {v31.8h}, [x2], #16
+
+        sshr            v16.4h,  v16.4h,  #2
+        sshr            v17.4h,  v17.4h,  #2
+        sshr            v18.4h,  v18.4h,  #2
+        sshr            v19.4h,  v19.4h,  #2
+
+        iwht4
+
+        st1             {v31.8h}, [x2], #16
+        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        iwht4
+
+        ld1             {v0.s}[0], [x0], x1
+        ld1             {v0.s}[1], [x0], x1
+        ins             v16.d[1], v17.d[0]
+        ins             v18.d[1], v19.d[0]
+        ld1             {v1.s}[0], [x0], x1
+        ld1             {v1.s}[1], [x0], x1
+
+        b               L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+        movi            v31.8h,  #0
+        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+        st1             {v31.8h}, [x2], #16
+
+        blr             x4
+
+        st1             {v31.8h}, [x2], #16
+        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x5
+
+        ld1             {v0.s}[0], [x0], x1
+        ld1             {v0.s}[1], [x0], x1
+        ins             v16.d[1], v17.d[0]
+        ins             v18.d[1], v19.d[0]
+        ld1             {v1.s}[0], [x0], x1
+        ld1             {v1.s}[1], [x0], x1
+        srshr           v16.8h,  v16.8h,  #4
+        srshr           v18.8h,  v18.8h,  #4
+
+L(itx_4x4_end):
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v16.8h,  v16.8h,  v0.8b
+        sqxtun          v0.8b,   v16.8h
+        uaddw           v18.8h,  v18.8h,  v1.8b
+        st1             {v0.s}[0], [x0], x1
+        sqxtun          v1.8b,   v18.8h
+        st1             {v0.s}[1], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        st1             {v1.s}[1], [x0], x1
+
+        br              x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        cbnz            w3,  1f
+        mov             w16, #2896*8
+        ld1r            {v16.8h}, [x2]
+        dup             v4.8h,   w16
+        strh            wzr, [x2]
+        sqrdmulh        v16.8h,  v16.8h,  v4.h[0]
+        ld1             {v0.s}[0], [x0], x1
+        sqrdmulh        v20.8h,  v16.8h,  v4.h[0]
+        ld1             {v0.s}[1], [x0], x1
+        srshr           v16.8h,  v20.8h,  #4
+        ld1             {v1.s}[0], [x0], x1
+        srshr           v18.8h,  v20.8h,  #4
+        ld1             {v1.s}[1], [x0], x1
+        b               L(itx_4x4_end)
+1:
+.endif
+        adr             x4,  inv_\txfm1\()_4h_x4_neon
+        adr             x5,  inv_\txfm2\()_4h_x4_neon
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
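+// 8-point inverse DCT via even/odd decomposition: idct_4 handles the
+// even inputs (r0, r2, r4, r6), the odd inputs get their own butterfly
+// stages, and the two halves are combined with saturating adds/subs.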
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb
+        idct_4          \r0, \r2, \r4, \r6, \sz
+
+        smull_smlsl     v2,  v3,  \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
+        smull_smlal     v4,  v5,  \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
+        smull_smlsl     v6,  v7,  \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
+        rshrn_sz        \r1, v2,  v3,  #12, \sz                   // t4a
+        rshrn_sz        \r7, v4,  v5,  #12, \sz                   // t7a
+        smull_smlal     v2,  v3,  \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
+        rshrn_sz        \r3, v6,  v7,  #12, \sz                   // t5a
+        rshrn_sz        \r5, v2,  v3,  #12, \sz                   // t6a
+
+        sqadd           v2\sz,   \r1\sz,  \r3\sz // t4
+        sqsub           \r1\sz,  \r1\sz,  \r3\sz // t5a
+        sqadd           v3\sz,   \r7\sz,  \r5\sz // t7
+        sqsub           \r3\sz,  \r7\sz,  \r5\sz // t6a
+
+        smull_smlsl     v4,  v5,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
+        smull_smlal     v6,  v7,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
+        rshrn_sz        v4,  v4,  v5,  #12, \sz // t5
+        rshrn_sz        v5,  v6,  v7,  #12, \sz // t6
+
+        sqsub           \r7\sz,  \r0\sz,  v3\sz // out7
+        sqadd           \r0\sz,  \r0\sz,  v3\sz // out0
+        sqadd           \r1\sz,  \r2\sz,  v5\sz // out1
+        sqsub           v6\sz,   \r2\sz,  v5\sz // out6
+        sqadd           \r2\sz,  \r4\sz,  v4\sz // out2
+        sqsub           \r5\sz,  \r4\sz,  v4\sz // out5
+        sqadd           \r3\sz,  \r6\sz,  v2\sz // out3
+        sqsub           \r4\sz,  \r6\sz,  v2\sz // out4
+        mov             \r6\szb, v6\szb         // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.8h}, [x16]
+        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
+        ret
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.8h}, [x16]
+        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
+        ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz
+        movrel          x16, iadst8_coeffs
+        ld1             {v0.8h, v1.8h}, [x16]
+
+        smull_smlal     v2,  v3,  v23, v16, v0.h[0], v0.h[1], \sz
+        smull_smlsl     v4,  v5,  v23, v16, v0.h[1], v0.h[0], \sz
+        smull_smlal     v6,  v7,  v21, v18, v0.h[2], v0.h[3], \sz
+        rshrn_sz        v16, v2,  v3,  #12, \sz  // t0a
+        rshrn_sz        v23, v4,  v5,  #12, \sz  // t1a
+        smull_smlsl     v2,  v3,  v21, v18, v0.h[3], v0.h[2], \sz
+        smull_smlal     v4,  v5,  v19, v20, v0.h[4], v0.h[5], \sz
+        rshrn_sz        v18, v6,  v7,  #12, \sz  // t2a
+        rshrn_sz        v21, v2,  v3,  #12, \sz  // t3a
+        smull_smlsl     v6,  v7,  v19, v20, v0.h[5], v0.h[4], \sz
+        smull_smlal     v2,  v3,  v17, v22, v0.h[6], v0.h[7], \sz
+        rshrn_sz        v20, v4,  v5,  #12, \sz  // t4a
+        rshrn_sz        v19, v6,  v7,  #12, \sz  // t5a
+        smull_smlsl     v4,  v5,  v17, v22, v0.h[7], v0.h[6], \sz
+        rshrn_sz        v22, v2,  v3,  #12, \sz  // t6a
+        rshrn_sz        v17, v4,  v5,  #12, \sz  // t7a
+
+        sqadd           v2\sz,   v16\sz,  v20\sz // t0
+        sqsub           v3\sz,   v16\sz,  v20\sz // t4
+        sqadd           v4\sz,   v23\sz,  v19\sz // t1
+        sqsub           v5\sz,   v23\sz,  v19\sz // t5
+        sqadd           v6\sz,   v18\sz,  v22\sz // t2
+        sqsub           v7\sz,   v18\sz,  v22\sz // t6
+        sqadd           v18\sz,  v21\sz,  v17\sz // t3
+        sqsub           v19\sz,  v21\sz,  v17\sz // t7
+
+        smull_smlal     v16, v17, v3,  v5,  v1.h[3], v1.h[2], \sz
+        smull_smlsl     v20, v21, v3,  v5,  v1.h[2], v1.h[3], \sz
+        smull_smlsl     v22, v23, v19, v7,  v1.h[3], v1.h[2], \sz
+
+        rshrn_sz        v3,  v16, v17, #12, \sz  // t4a
+        rshrn_sz        v5,  v20, v21, #12, \sz  // t5a
+
+        smull_smlal     v16, v17, v19, v7,  v1.h[2], v1.h[3], \sz
+
+        rshrn_sz        v7,  v22, v23, #12, \sz  // t6a
+        rshrn_sz        v19, v16, v17, #12, \sz  // t7a
+
+        sqadd           \o0\()\sz, v2\sz, v6\sz  // out0
+        sqsub           v2\sz,     v2\sz, v6\sz  // t2
+        sqadd           \o7\()\sz, v4\sz, v18\sz // out7
+        sqsub           v4\sz,     v4\sz, v18\sz // t3
+        sqneg           \o7\()\sz, \o7\()\sz     // out7
+
+        sqadd           \o1\()\sz, v3\sz, v7\sz  // out1
+        sqsub           v3\sz,     v3\sz, v7\sz  // t6
+        sqadd           \o6\()\sz, v5\sz, v19\sz // out6
+        sqsub           v5\sz,     v5\sz, v19\sz // t7
+        sqneg           \o1\()\sz, \o1\()\sz     // out1
+
+        smull_smlal     v18, v19, v2,  v4,  v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
+        smull_smlsl     v6,  v7,  v2,  v4,  v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
+        smull_smlsl     v20, v21, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
+        rshrn_sz        v2,  v18, v19, #12, \sz // out3
+        smull_smlal     v18, v19, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
+        rshrn_sz        v3,  v20, v21, #12, \sz // out5
+        rshrn_sz        \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+        rshrn_sz        \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)
+
+        sqneg           \o3\()\sz, v2\sz     // out3
+        sqneg           \o5\()\sz, v3\sz     // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .8h
+        ret
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .8h
+        ret
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .4h
+        ret
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .4h
+        ret
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+        sqshl           v16.8h,  v16.8h,  #1
+        sqshl           v17.8h,  v17.8h,  #1
+        sqshl           v18.8h,  v18.8h,  #1
+        sqshl           v19.8h,  v19.8h,  #1
+        sqshl           v20.8h,  v20.8h,  #1
+        sqshl           v21.8h,  v21.8h,  #1
+        sqshl           v22.8h,  v22.8h,  #1
+        sqshl           v23.8h,  v23.8h,  #1
+        ret
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+        sqshl           v16.4h,  v16.4h,  #1
+        sqshl           v17.4h,  v17.4h,  #1
+        sqshl           v18.4h,  v18.4h,  #1
+        sqshl           v19.4h,  v19.4h,  #1
+        sqshl           v20.4h,  v20.4h,  #1
+        sqshl           v21.4h,  v21.4h,  #1
+        sqshl           v22.4h,  v22.4h,  #1
+        sqshl           v23.4h,  v23.4h,  #1
+        ret
+endfunc
+
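+// Shared 8x8 core. Register convention used throughout this file: x0/x1 are
+// the destination pointer and stride, x2 the coefficient buffer (cleared as
+// it is consumed), x4/x5 the first-pass (horizontal) and second-pass
+// (vertical) transform functions, and x15 the saved return address, since
+// blr clobbers x30. The intermediate is rounded down by 1 bit between the
+// passes; in the identity_ variant the identity's shl #1 and that srshr #1
+// cancel, so both are skipped.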
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+        movi            v28.8h,  #0
+        movi            v29.8h,  #0
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
+        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+.ifc \variant, identity_
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
+        blr             x4
+
+        srshr           v16.8h,  v16.8h,  #1
+        srshr           v17.8h,  v17.8h,  #1
+        srshr           v18.8h,  v18.8h,  #1
+        srshr           v19.8h,  v19.8h,  #1
+        srshr           v20.8h,  v20.8h,  #1
+        srshr           v21.8h,  v21.8h,  #1
+        srshr           v22.8h,  v22.8h,  #1
+        srshr           v23.8h,  v23.8h,  #1
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+        blr             x5
+
+        load_add_store_8x8 x0, x7
+        br              x15
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
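+// Emit one exported entry point per (txfm1, txfm2) combination. dct_dct
+// additionally takes the DC-only early exit through the idct_dc macro
+// (defined earlier in this file) before falling through to the generic
+// path.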
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1
+.endif
+        adr             x5,  inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_8x8_neon
+.else
+        adr             x4,  inv_\txfm1\()_8h_x8_neon
+        b               inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
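+// Rectangular 8x4 and 4x8: 2:1 blocks are scaled by 1/sqrt(2) on input per
+// the AV1 spec, done here as a sqrdmulh by 2896*8 (2896/4096 ~= 1/sqrt(2))
+// via the scale_input helper. The two 4-element halves are packed into and
+// out of the d-lanes around the transposes.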
+function inv_txfm_add_8x4_neon
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+        st1             {v30.8h,v31.8h}, [x2], #32
+        ld1             {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
+        st1             {v30.8h,v31.8h}, [x2]
+
+        scale_input     .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x4
+
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+        ins             v16.d[1], v20.d[0]
+        ins             v17.d[1], v21.d[0]
+        ins             v18.d[1], v22.d[0]
+        ins             v19.d[1], v23.d[0]
+
+        blr             x5
+
+        load_add_store_8x4 x0, x7
+        br              x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+        movi            v28.8h,  #0
+        movi            v29.8h,  #0
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19
+
+        blr             x4
+
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        ins             v20.d[0], v16.d[1]
+        ins             v21.d[0], v17.d[1]
+        ins             v22.d[0], v18.d[1]
+        ins             v23.d[0], v19.d[1]
+
+        blr             x5
+
+        load_add_store_4x8 x0, x7
+        br              x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0
+.endif
+        adr             x4,  inv_\txfm1\()_\h\()h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_\w\()h_x\h\()_neon
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
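+// 16-point DCT: the even-indexed inputs go through the 8-point idct_8 macro
+// (defined earlier in this file), the odd-indexed ones through the
+// butterfly stages below. \sz selects the element arrangement (.8h or .4h)
+// and \szb the matching byte arrangement (.16b or .8b) for plain register
+// moves; v0/v1 must hold idct_coeffs on entry.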
+.macro idct_16 sz, szb
+        idct_8          v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb
+
+        smull_smlsl     v2,  v3,  v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
+        smull_smlal     v4,  v5,  v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
+        smull_smlsl     v6,  v7,  v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
+        rshrn_sz        v17, v2,  v3,  #12, \sz                   // t8a
+        rshrn_sz        v31, v4,  v5,  #12, \sz                   // t15a
+        smull_smlal     v2,  v3,  v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
+        smull_smlsl     v4,  v5,  v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
+        rshrn_sz        v23, v6,  v7,  #12, \sz                   // t9a
+        rshrn_sz        v25, v2,  v3,  #12, \sz                   // t14a
+        smull_smlal     v6,  v7,  v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
+        smull_smlsl     v2,  v3,  v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
+        rshrn_sz        v21, v4,  v5,  #12, \sz                   // t10a
+        rshrn_sz        v27, v6,  v7,  #12, \sz                   // t13a
+        smull_smlal     v4,  v5,  v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
+        rshrn_sz        v19, v2,  v3,  #12, \sz                   // t11a
+        rshrn_sz        v29, v4,  v5,  #12, \sz                   // t12a
+
+        sqsub           v2\sz,   v17\sz,  v23\sz  // t9
+        sqadd           v17\sz,  v17\sz,  v23\sz  // t8
+        sqsub           v3\sz,   v31\sz,  v25\sz  // t14
+        sqadd           v31\sz,  v31\sz,  v25\sz  // t15
+        sqsub           v23\sz,  v19\sz,  v21\sz  // t10
+        sqadd           v19\sz,  v19\sz,  v21\sz  // t11
+        sqadd           v25\sz,  v29\sz,  v27\sz  // t12
+        sqsub           v29\sz,  v29\sz,  v27\sz  // t13
+
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[2], v0.h[3], \sz // -> t9a
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[3], v0.h[2], \sz // -> t14a
+        rshrn_sz        v21, v4,  v5,  #12, \sz                   // t9a
+        rshrn_sz        v27, v6,  v7,  #12, \sz                   // t14a
+
+        smull_smlsl     v4,  v5,  v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
+        smull_smlal     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+        rshrn_sz        v29, v4,  v5,  #12, \sz                   // t13a
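+        // The smlal above computes the negated t10a; the accumulators are
+        // negated while still 32 bit so that the rounding narrow applies to
+        // the correct sign (in the .4h case only v6 carries data, hence the
+        // .ifc guard around the second neg).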
+        neg             v6.4s,   v6.4s
+.ifc \sz, .8h
+        neg             v7.4s,   v7.4s
+.endif
+        rshrn_sz        v23, v6,  v7,  #12, \sz                   // t10a
+
+        sqsub           v2\sz,   v17\sz,  v19\sz  // t11a
+        sqadd           v17\sz,  v17\sz,  v19\sz  // t8a
+        sqsub           v3\sz,   v31\sz,  v25\sz  // t12a
+        sqadd           v31\sz,  v31\sz,  v25\sz  // t15a
+        sqadd           v19\sz,  v21\sz,  v23\sz  // t9
+        sqsub           v21\sz,  v21\sz,  v23\sz  // t10
+        sqsub           v25\sz,  v27\sz,  v29\sz  // t13
+        sqadd           v27\sz,  v27\sz,  v29\sz  // t14
+
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t11
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t12
+        smull_smlsl     v2,  v3,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+
+        rshrn_sz        v4,  v4,  v5,  #12, \sz   // t11
+        rshrn_sz        v5,  v6,  v7,  #12, \sz   // t12
+        smull_smlal     v6,  v7,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+        rshrn_sz        v2,  v2,  v3,  #12, \sz   // t10a
+        rshrn_sz        v3,  v6,  v7,  #12, \sz   // t13a
+
+        sqadd           v6\sz,   v16\sz,  v31\sz  // out0
+        sqsub           v31\sz,  v16\sz,  v31\sz  // out15
+        mov             v16\szb, v6\szb
+        sqadd           v23\sz,  v30\sz,  v17\sz  // out7
+        sqsub           v7\sz,   v30\sz,  v17\sz  // out8
+        sqadd           v17\sz,  v18\sz,  v27\sz  // out1
+        sqsub           v30\sz,  v18\sz,  v27\sz  // out14
+        sqadd           v18\sz,  v20\sz,  v3\sz   // out2
+        sqsub           v29\sz,  v20\sz,  v3\sz   // out13
+        sqadd           v3\sz,   v28\sz,  v19\sz  // out6
+        sqsub           v25\sz,  v28\sz,  v19\sz  // out9
+        sqadd           v19\sz,  v22\sz,  v5\sz   // out3
+        sqsub           v28\sz,  v22\sz,  v5\sz   // out12
+        sqadd           v20\sz,  v24\sz,  v4\sz   // out4
+        sqsub           v27\sz,  v24\sz,  v4\sz   // out11
+        sqadd           v21\sz,  v26\sz,  v2\sz   // out5
+        sqsub           v26\sz,  v26\sz,  v2\sz   // out10
+        mov             v24\szb, v7\szb
+        mov             v22\szb, v3\szb
+.endm
+
+function inv_dct_8h_x16_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.8h, v1.8h}, [x16]
+        idct_16         .8h, .16b
+        ret
+endfunc
+
+function inv_dct_4h_x16_neon, export=1
+        movrel          x16, idct_coeffs
+        ld1             {v0.8h, v1.8h}, [x16]
+        idct_16         .4h, .8b
+        ret
+endfunc
+
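+// 16-point ADST. As with iadst_8, flipadst passes the output registers in
+// reverse order; the .ifc blocks below special-case the orderings where an
+// output register would otherwise be clobbered while still needed as an
+// input.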
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
+        movrel          x16, iadst16_coeffs
+        ld1             {v0.8h, v1.8h}, [x16]
+        movrel          x16, idct_coeffs
+
+        smull_smlal     v2,  v3,  v31, v16, v0.h[0], v0.h[1], \sz // -> t0
+        smull_smlsl     v4,  v5,  v31, v16, v0.h[1], v0.h[0], \sz // -> t1
+        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t2
+        rshrn_sz        v16, v2,  v3,  #12, \sz   // t0
+        rshrn_sz        v31, v4,  v5,  #12, \sz   // t1
+        smull_smlsl     v2,  v3,  v29, v18, v0.h[3], v0.h[2], \sz // -> t3
+        smull_smlal     v4,  v5,  v27, v20, v0.h[4], v0.h[5], \sz // -> t4
+        rshrn_sz        v18, v6,  v7,  #12, \sz   // t2
+        rshrn_sz        v29, v2,  v3,  #12, \sz   // t3
+        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t5
+        smull_smlal     v2,  v3,  v25, v22, v0.h[6], v0.h[7], \sz // -> t6
+        rshrn_sz        v20, v4,  v5,  #12, \sz   // t4
+        rshrn_sz        v27, v6,  v7,  #12, \sz   // t5
+        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t7
+        smull_smlal     v6,  v7,  v23, v24, v1.h[0], v1.h[1], \sz // -> t8
+        rshrn_sz        v22, v2,  v3,  #12, \sz   // t6
+        rshrn_sz        v25, v4,  v5,  #12, \sz   // t7
+        smull_smlsl     v2,  v3,  v23, v24, v1.h[1], v1.h[0], \sz // -> t9
+        smull_smlal     v4,  v5,  v21, v26, v1.h[2], v1.h[3], \sz // -> t10
+        rshrn_sz        v23, v6,  v7,  #12, \sz   // t8
+        rshrn_sz        v24, v2,  v3,  #12, \sz   // t9
+        smull_smlsl     v6,  v7,  v21, v26, v1.h[3], v1.h[2], \sz // -> t11
+        smull_smlal     v2,  v3,  v19, v28, v1.h[4], v1.h[5], \sz // -> t12
+        rshrn_sz        v21, v4,  v5,  #12, \sz   // t10
+        rshrn_sz        v26, v6,  v7,  #12, \sz   // t11
+        smull_smlsl     v4,  v5,  v19, v28, v1.h[5], v1.h[4], \sz // -> t13
+        smull_smlal     v6,  v7,  v17, v30, v1.h[6], v1.h[7], \sz // -> t14
+        rshrn_sz        v19, v2,  v3,  #12, \sz   // t12
+        rshrn_sz        v28, v4,  v5,  #12, \sz   // t13
+        smull_smlsl     v2,  v3,  v17, v30, v1.h[7], v1.h[6], \sz // -> t15
+        rshrn_sz        v17, v6,  v7,  #12, \sz   // t14
+        rshrn_sz        v30, v2,  v3,  #12, \sz   // t15
+
+        ld1             {v0.8h}, [x16]
+
+        sqsub           v2\sz,   v16\sz,  v23\sz // t8a
+        sqadd           v16\sz,  v16\sz,  v23\sz // t0a
+        sqsub           v3\sz,   v31\sz,  v24\sz // t9a
+        sqadd           v31\sz,  v31\sz,  v24\sz // t1a
+        sqadd           v23\sz,  v18\sz,  v21\sz // t2a
+        sqsub           v18\sz,  v18\sz,  v21\sz // t10a
+        sqadd           v24\sz,  v29\sz,  v26\sz // t3a
+        sqsub           v29\sz,  v29\sz,  v26\sz // t11a
+        sqadd           v21\sz,  v20\sz,  v19\sz // t4a
+        sqsub           v20\sz,  v20\sz,  v19\sz // t12a
+        sqadd           v26\sz,  v27\sz,  v28\sz // t5a
+        sqsub           v27\sz,  v27\sz,  v28\sz // t13a
+        sqadd           v19\sz,  v22\sz,  v17\sz // t6a
+        sqsub           v22\sz,  v22\sz,  v17\sz // t14a
+        sqadd           v28\sz,  v25\sz,  v30\sz // t7a
+        sqsub           v25\sz,  v25\sz,  v30\sz // t15a
+
+        smull_smlal     v4,  v5,  v2,  v3,  v0.h[5], v0.h[4], \sz // -> t8
+        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[4], v0.h[5], \sz // -> t9
+        smull_smlal     v2,  v3,  v18, v29, v0.h[7], v0.h[6], \sz // -> t10
+        rshrn_sz        v17, v4,  v5,  #12, \sz  // t8
+        rshrn_sz        v30, v6,  v7,  #12, \sz  // t9
+        smull_smlsl     v4,  v5,  v18, v29, v0.h[6], v0.h[7], \sz // -> t11
+        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t12
+        rshrn_sz        v18, v2,  v3,  #12, \sz  // t10
+        rshrn_sz        v29, v4,  v5,  #12, \sz  // t11
+        smull_smlal     v2,  v3,  v27, v20, v0.h[4], v0.h[5], \sz // -> t13
+        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t14
+        rshrn_sz        v27, v6,  v7,  #12, \sz  // t12
+        rshrn_sz        v20, v2,  v3,  #12, \sz  // t13
+        smull_smlal     v6,  v7,  v25, v22, v0.h[6], v0.h[7], \sz // -> t15
+        rshrn_sz        v25, v4,  v5,  #12, \sz  // t14
+        rshrn_sz        v22, v6,  v7,  #12, \sz  // t15
+
+        sqsub           v2\sz,   v16\sz,  v21\sz // t4
+        sqadd           v16\sz,  v16\sz,  v21\sz // t0
+        sqsub           v3\sz,   v31\sz,  v26\sz // t5
+        sqadd           v31\sz,  v31\sz,  v26\sz // t1
+        sqadd           v21\sz,  v23\sz,  v19\sz // t2
+        sqsub           v23\sz,  v23\sz,  v19\sz // t6
+        sqadd           v26\sz,  v24\sz,  v28\sz // t3
+        sqsub           v24\sz,  v24\sz,  v28\sz // t7
+        sqadd           v19\sz,  v17\sz,  v27\sz // t8a
+        sqsub           v17\sz,  v17\sz,  v27\sz // t12a
+        sqadd           v28\sz,  v30\sz,  v20\sz // t9a
+        sqsub           v30\sz,  v30\sz,  v20\sz // t13a
+        sqadd           v27\sz,  v18\sz,  v25\sz // t10a
+        sqsub           v18\sz,  v18\sz,  v25\sz // t14a
+        sqadd           v20\sz,  v29\sz,  v22\sz // t11a
+        sqsub           v29\sz,  v29\sz,  v22\sz // t15a
+
+        smull_smlal     v4,  v5,  v2,  v3,  v0.h[3], v0.h[2], \sz // -> t4a
+        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[2], v0.h[3], \sz // -> t5a
+        smull_smlsl     v2,  v3,  v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
+        rshrn_sz        v22, v4,  v5,  #12, \sz // t4a
+        rshrn_sz        v25, v6,  v7,  #12, \sz // t5a
+        smull_smlal     v4,  v5,  v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
+        smull_smlal     v6,  v7,  v17, v30, v0.h[3], v0.h[2], \sz // -> t12
+        rshrn_sz        v24, v2,  v3,  #12, \sz // t6a
+        rshrn_sz        v23, v4,  v5,  #12, \sz // t7a
+        smull_smlsl     v2,  v3,  v17, v30, v0.h[2], v0.h[3], \sz // -> t13
+        smull_smlsl     v4,  v5,  v29, v18, v0.h[3], v0.h[2], \sz // -> t14
+        rshrn_sz        v17, v6,  v7,  #12, \sz // t12
+        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t15
+        rshrn_sz        v29, v2,  v3,  #12, \sz // t13
+        rshrn_sz        v30, v4,  v5,  #12, \sz // t14
+        rshrn_sz        v18, v6,  v7,  #12, \sz // t15
+
+        sqsub           v2\sz,   v16\sz,  v21\sz // t2a
+.ifc \o0, v16
+        sqadd           \o0\sz,  v16\sz,  v21\sz // out0
+        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
+        sqadd           \o15\sz, v31\sz,  v26\sz // out15
+.else
+        sqadd           v4\sz,   v16\sz,  v21\sz // out0
+        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
+        sqadd           \o15\sz, v31\sz,  v26\sz // out15
+        mov             \o0\szb, v4\szb
+.endif
+        sqneg           \o15\sz, \o15\sz         // out15
+
+        sqsub           v3\sz,   v29\sz,  v18\sz // t15a
+        sqadd           \o13\sz, v29\sz,  v18\sz // out13
+        sqadd           \o2\sz,  v17\sz,  v30\sz // out2
+        sqsub           v26\sz,  v17\sz,  v30\sz // t14a
+        sqneg           \o13\sz, \o13\sz         // out13
+
+        sqadd           \o1\sz,  v19\sz,  v27\sz // out1
+        sqsub           v27\sz,  v19\sz,  v27\sz // t10
+        sqadd           \o14\sz, v28\sz,  v20\sz // out14
+        sqsub           v20\sz,  v28\sz,  v20\sz // t11
+        sqneg           \o1\sz,  \o1\sz          // out1
+
+        sqadd           \o3\sz,  v22\sz,  v24\sz // out3
+        sqsub           v22\sz,  v22\sz,  v24\sz // t6
+        sqadd           \o12\sz, v25\sz,  v23\sz // out12
+        sqsub           v23\sz,  v25\sz,  v23\sz // t7
+        sqneg           \o3\sz,  \o3\sz          // out3
+
+        smull_smlsl     v24, v25, v2,  v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+        smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+        smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
+
+        rshrn_sz        v24, v24, v25, #12, \sz // out8
+        rshrn_sz        v4,  v4,  v5,  #12, \sz // out7
+        rshrn_sz        v5,  v6,  v7,  #12, \sz // out5
+        smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+        smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+        rshrn_sz        v26, v6,  v7,  #12, \sz // out10
+
+        smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
+
+        rshrn_sz        \o4, v2,  v3,  #12, \sz // out4
+        rshrn_sz        v6,  v6,  v7,  #12, \sz // out11
+        rshrn_sz        v7,  v21, v25, #12, \sz // out9
+        rshrn_sz        \o6, v22, v23, #12, \sz // out6
+
+.ifc \o8, v23
+        mov             \o8\szb,  v24\szb
+        mov             \o10\szb, v26\szb
+.endif
+
+        sqneg           \o7\sz,  v4\sz // out7
+        sqneg           \o5\sz,  v5\sz // out5
+        sqneg           \o11\sz, v6\sz // out11
+        sqneg           \o9\sz,  v7\sz // out9
+.endm
+
+function inv_adst_8h_x16_neon, export=1
+        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
+        ret
+endfunc
+
+function inv_flipadst_8h_x16_neon, export=1
+        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
+        ret
+endfunc
+
+function inv_adst_4h_x16_neon, export=1
+        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
+        ret
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
+        ret
+endfunc
+
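+// The 16-point identity transform scales by 2*sqrt(2). With sqrt(2)
+// approximated as 5793/4096, this splits as 2*x + x*(5793-4096)/2048:
+// sqrdmulh by 2*(5793-4096)*8 (sqrdmulh returns a*b/2^15, rounded) yields
+// the fractional term, and the two saturating adds supply the 2*x term.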
+function inv_identity_8h_x16_neon, export=1
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        sqrdmulh        v2.8h,      v\i\().8h,  v0.h[0]
+        sqadd           v\i\().8h,  v\i\().8h,  v\i\().8h
+        sqadd           v\i\().8h,  v\i\().8h,  v2.8h
+.endr
+        ret
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        sqrdmulh        v2.4h,      v\i\().4h,  v0.h[0]
+        sqadd           v\i\().4h,  v\i\().4h,  v\i\().4h
+        sqadd           v\i\().4h,  v\i\().4h,  v2.4h
+.endr
+        ret
+endfunc
+
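+// Identity variants with the inter-pass rounding shift folded in: _shift2
+// halves twice (sshr #1 on the fractional term, then a rounding halving
+// add) and _shift1 once (srshr #1 before the add), so no separate srshr
+// pass over the registers is needed afterwards.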
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        sshr            v2.8h,   v2.8h,   #1
+        srhadd          \i,      \i,      v2.8h
+.endr
+.endm
+
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        srshr           v2.8h,   v2.8h,   #1
+        sqadd           \i,      \i,      v2.8h
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        srshr           v2.8h,   v2.8h,   #1
+        sqadd           \i,      \i,      v2.8h
+.endr
+.endm
+
+.macro identity_8x8 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        sqadd           \i,      \i,      \i
+        sqadd           \i,      \i,      v2.8h
+.endr
+.endm
+
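+// First-pass helper, one 16x8 slab per call: reads 16 vectors of 8
+// coefficients from x7 at stride x8, zeroing the buffer behind itself,
+// optionally applies the rectangular 1/sqrt(2) scale and/or the folded
+// identity, downshifts by \shift, then transposes both 8x8 halves and
+// stores them interleaved to the intermediate buffer at x6. Returns via
+// x14, as x30 is clobbered by blr.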
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x8_neon
+        mov             x14, x30
+        movi            v7.8h,  #0
+.if \identity
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+.elseif \scale
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+.endif
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i}, [x7]
+        st1             {v7.8h}, [x7], x8
+.endr
+.if \scale
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+        identity_8x16_shift2 v0.h[0]
+.else
+        blr             x4
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        srshr           \i,  \i,  #\shift
+.endr
+.endif
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+        st1             {\i}, [x6], #16
+.endr
+
+        br              x14
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+
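+// Second-pass helper: loads 16 rows of one 8-wide strip from x7 (stride
+// x8), runs the vertical transform in x5 and accumulates the result into
+// the destination at x6.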
+function inv_txfm_add_vert_8x16_neon
+        mov             x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        blr             x5
+        load_add_store_8x16 x6, x7
+        br              x14
+endfunc
+
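+// 16x16 driver: the intermediate lives in a 512 byte (16*16*2) stack
+// buffer. The first pass (x9) runs on two halves of the coefficients; when
+// the eob in w3 is below the per-transform threshold in w13, the second
+// half is known to be all zeros and the corresponding half of the buffer
+// is zero-filled instead of transformed.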
+function inv_txfm_add_16x16_neon
+        mov             x15, x30
+        sub             sp,  sp,  #512
+.irp i, 0, 8
+        add             x6,  sp,  #(\i*16*2)
+.if \i == 8
+        cmp             w3,  w13
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #16*2
+        blr             x9
+.endr
+        b               2f
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+2:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+

+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2
+.endif
+.ifc \txfm1, identity
+        adr             x9,  inv_txfm_horz_identity_16x8_neon
+.else
+        adr             x9,  inv_txfm_horz_16x8_neon
+        adr             x4,  inv_\txfm1\()_8h_x16_neon
+.endif
+        adr             x5,  inv_\txfm2\()_8h_x16_neon
+        mov             x13, #\eob_half
+        b               inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
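+// 4x16 and 16x4: the long dimension is handled as two 8-element halves.
+// For 4x16 the second coefficient half is skipped (and the registers
+// zeroed) when the eob in w3 falls below the threshold in w13.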
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+        mov             x15, x30
+        movi            v4.8h,  #0
+
+.ifc \variant, identity_
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+        ld1             {\i},    [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+        ld1             {\i}[1], [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+        ld1             {\i},    [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+        ld1             {\i}[1], [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+
+        identity_8x16_shift1 v0.h[0]
+.else
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+        ld1             {\i},    [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+
+        blr             x4
+
+        ins             v16.d[1], v20.d[0]
+        ins             v17.d[1], v21.d[0]
+        ins             v18.d[1], v22.d[0]
+        ins             v19.d[1], v23.d[0]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+        srshr           \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        mov             x6,  x0
+        load_add_store_8x4 x6, x7
+
+.ifc \variant, identity_
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+.else
+        ins             v24.d[1], v28.d[0]
+        ins             v25.d[1], v29.d[0]
+        ins             v26.d[1], v30.d[0]
+        ins             v27.d[1], v31.d[0]
+        srshr           v16.8h,  v24.8h,  #1
+        srshr           v17.8h,  v25.8h,  #1
+        srshr           v18.8h,  v26.8h,  #1
+        srshr           v19.8h,  v27.8h,  #1
+.endif
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        add             x6,  x0,  #8
+        load_add_store_8x4 x6, x7
+
+        br              x15
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+        mov             x15, x30
+        movi            v2.8h,   #0
+
+        mov             x11, #32
+        cmp             w3,  w13
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        ld1             {\i},    [x6]
+        st1             {v2.8h}, [x6], x11
+.endr
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+        ld1             {\i},    [x6]
+        st1             {v2.8h}, [x6], x11
+.endr
+        blr             x4
+        srshr           v24.8h,  v16.8h,  #1
+        srshr           v25.8h,  v17.8h,  #1
+        srshr           v26.8h,  v18.8h,  #1
+        srshr           v27.8h,  v19.8h,  #1
+.endif
+        transpose_4x8h  v24, v25, v26, v27, v4,  v5,  v6,  v7
+        ins             v28.d[0], v24.d[1]
+        ins             v29.d[0], v25.d[1]
+        ins             v30.d[0], v26.d[1]
+        ins             v31.d[0], v27.d[1]
+
+        b               2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+        movi            \i,  #0
+.endr
+2:
+        movi            v2.8h,   #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+        ld1             {\i},    [x2]
+        st1             {v2.8h}, [x2], x11
+.endr
+.ifc \variant, identity_
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
+        blr             x4
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+        srshr           \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        ins             v20.d[0], v16.d[1]
+        ins             v21.d[0], v17.d[1]
+        ins             v22.d[0], v18.d[1]
+        ins             v23.d[0], v19.d[1]
+
+        blr             x5
+
+        load_add_store_4x16 x0, x6
+
+        br              x15
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+.if \w == 4
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
+        mov             w13, #\eob_half
+.else
+        adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
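+// 8x16 and 16x8: 2:1 rectangles, so every coefficient load is followed by
+// the 1/sqrt(2) scale_input. For the identity_ variants the scale, the
+// identity doubling and the inter-pass srshr #1 are folded together or
+// cancel outright.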
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+        mov             x15, x30
+        movi            v4.8h,  #0
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i},    [x2]
+        st1             {v4.8h}, [x2], #16
+.endr
+
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x16_shift1 v0.h[0]
+.else
+        blr             x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        srshr           \i,  \i,  #1
+.endr
+.endif
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+        blr             x5
+
+        mov             x6,  x0
+        load_add_store_8x8 x6, x7
+
+.ifc \variant, identity_
+        mov             v16.16b, v24.16b
+        mov             v17.16b, v25.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v27.16b
+        mov             v20.16b, v28.16b
+        mov             v21.16b, v29.16b
+        mov             v22.16b, v30.16b
+        mov             v23.16b, v31.16b
+.else
+        srshr           v16.8h,  v24.8h,  #1
+        srshr           v17.8h,  v25.8h,  #1
+        srshr           v18.8h,  v26.8h,  #1
+        srshr           v19.8h,  v27.8h,  #1
+        srshr           v20.8h,  v28.8h,  #1
+        srshr           v21.8h,  v29.8h,  #1
+        srshr           v22.8h,  v30.8h,  #1
+        srshr           v23.8h,  v31.8h,  #1
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+        blr             x5
+
+        add             x0,  x0,  #8
+        load_add_store_8x8 x0, x7
+
+        br              x15
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+        mov             x15, x30
+        movi            v4.8h,   #0
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+        mov             x11, #32
+
+        cmp             w3,  w13
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i},    [x6]
+        st1             {v4.8h}, [x6], x11
+.endr
+        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        ld1             {\i},    [x6]
+        st1             {v4.8h}, [x6], x11
+.endr
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        srshr           v24.8h,  v16.8h,  #1
+        srshr           v25.8h,  v17.8h,  #1
+        srshr           v26.8h,  v18.8h,  #1
+        srshr           v27.8h,  v19.8h,  #1
+        srshr           v28.8h,  v20.8h,  #1
+        srshr           v29.8h,  v21.8h,  #1
+        srshr           v30.8h,  v22.8h,  #1
+        srshr           v31.8h,  v23.8h,  #1
+.endif
+        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        movi            \i,  #0
+.endr
+
+2:
+        movi            v4.8h,   #0
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        ld1             {\i},    [x2]
+        st1             {v4.8h}, [x2], x11
+.endr
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
+        blr             x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        srshr           \i,  \i,  #1
+.endr
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+        blr             x5
+
+        load_add_store_8x16 x0, x6
+
+        br              x15
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
+.if \w == 8
+        mov             x13, #\eob_half
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
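+// Odd half of the 32-point DCT: consumes the 16 odd-indexed inputs in
+// v16-v31 and leaves t16..t31, ready to be combined with the even half
+// (inv_dct_8h_x16_neon) by the callers below. The movrel offset of 2*16
+// bytes skips the first 16 idct_coeffs entries to reach the 32-point
+// constants.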
+function inv_dct32_odd_8h_x16_neon, export=1
+        movrel          x16, idct_coeffs, 2*16
+        ld1             {v0.8h, v1.8h}, [x16]
+        sub             x16, x16, #2*16
+
+        smull_smlsl     v2,  v3,  v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
+        smull_smlal     v4,  v5,  v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
+        smull_smlsl     v6,  v7,  v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
+        rshrn_sz        v16, v2,  v3,  #12, .8h                   // t16a
+        rshrn_sz        v31, v4,  v5,  #12, .8h                   // t31a
+        smull_smlal     v2,  v3,  v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
+        smull_smlsl     v4,  v5,  v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
+        rshrn_sz        v24, v6,  v7,  #12, .8h                   // t17a
+        rshrn_sz        v23, v2,  v3,  #12, .8h                   // t30a
+        smull_smlal     v6,  v7,  v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
+        smull_smlsl     v2,  v3,  v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
+        rshrn_sz        v20, v4,  v5,  #12, .8h                   // t18a
+        rshrn_sz        v27, v6,  v7,  #12, .8h                   // t29a
+        smull_smlal     v4,  v5,  v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
+        smull_smlsl     v6,  v7,  v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
+        rshrn_sz        v28, v2,  v3,  #12, .8h                   // t19a
+        rshrn_sz        v19, v4,  v5,  #12, .8h                   // t28a
+        smull_smlal     v2,  v3,  v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
+        smull_smlsl     v4,  v5,  v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
+        rshrn_sz        v18, v6,  v7,  #12, .8h                   // t20a
+        rshrn_sz        v29, v2,  v3,  #12, .8h                   // t27a
+        smull_smlal     v6,  v7,  v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
+        smull_smlsl     v2,  v3,  v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
+        rshrn_sz        v26, v4,  v5,  #12, .8h                   // t21a
+        rshrn_sz        v21, v6,  v7,  #12, .8h                   // t26a
+        smull_smlal     v4,  v5,  v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
+        smull_smlsl     v6,  v7,  v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
+        rshrn_sz        v22, v2,  v3,  #12, .8h                   // t22a
+        rshrn_sz        v25, v4,  v5,  #12, .8h                   // t25a
+        smull_smlal     v2,  v3,  v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
+        rshrn_sz        v30, v6,  v7,  #12, .8h                   // t23a
+        rshrn_sz        v17, v2,  v3,  #12, .8h                   // t24a
+
+        ld1             {v0.8h}, [x16]
+
+        sqsub           v2.8h,   v16.8h,  v24.8h // t17
+        sqadd           v16.8h,  v16.8h,  v24.8h // t16
+        sqsub           v3.8h,   v31.8h,  v23.8h // t30
+        sqadd           v31.8h,  v31.8h,  v23.8h // t31
+        sqsub           v24.8h,  v28.8h,  v20.8h // t18
+        sqadd           v28.8h,  v28.8h,  v20.8h // t19
+        sqadd           v23.8h,  v18.8h,  v26.8h // t20
+        sqsub           v18.8h,  v18.8h,  v26.8h // t21
+        sqsub           v20.8h,  v30.8h,  v22.8h // t22
+        sqadd           v30.8h,  v30.8h,  v22.8h // t23
+        sqadd           v26.8h,  v17.8h,  v25.8h // t24
+        sqsub           v17.8h,  v17.8h,  v25.8h // t25
+        sqsub           v22.8h,  v29.8h,  v21.8h // t26
+        sqadd           v29.8h,  v29.8h,  v21.8h // t27
+        sqadd           v25.8h,  v19.8h,  v27.8h // t28
+        sqsub           v19.8h,  v19.8h,  v27.8h // t29
+
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[4], v0.h[5], .8h // -> t17a
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[5], v0.h[4], .8h // -> t30a
+        smull_smlal     v2,  v3,  v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
+        rshrn_sz        v21, v4,  v5,  #12, .8h                   // t17a
+        rshrn_sz        v27, v6,  v7,  #12, .8h                   // t30a
+        neg             v2.4s,   v2.4s                            // -> t18a
+        neg             v3.4s,   v3.4s                            // -> t18a
+        smull_smlsl     v4,  v5,  v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
+        smull_smlsl     v6,  v7,  v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
+        rshrn_sz        v19, v2,  v3,  #12, .8h                   // t18a
+        rshrn_sz        v24, v4,  v5,  #12, .8h                   // t29a
+        smull_smlal     v2,  v3,  v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
+        smull_smlal     v4,  v5,  v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
+        rshrn_sz        v22, v6,  v7,  #12, .8h                   // t21a
+        rshrn_sz        v18, v2,  v3,  #12, .8h                   // t26a
+        neg             v4.4s,   v4.4s                            // -> t22a
+        neg             v5.4s,   v5.4s                            // -> t22a
+        smull_smlsl     v6,  v7,  v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
+        rshrn_sz        v17, v4,  v5,  #12, .8h                   // t22a
+        rshrn_sz        v20, v6,  v7,  #12, .8h                   // t25a
+
+        sqsub           v2.8h,   v27.8h,  v24.8h // t29
+        sqadd           v27.8h,  v27.8h,  v24.8h // t30
+        sqsub           v3.8h,   v21.8h,  v19.8h // t18
+        sqadd           v21.8h,  v21.8h,  v19.8h // t17
+        sqsub           v24.8h,  v16.8h,  v28.8h // t19a
+        sqadd           v16.8h,  v16.8h,  v28.8h // t16a
+        sqsub           v19.8h,  v30.8h,  v23.8h // t20a
+        sqadd           v30.8h,  v30.8h,  v23.8h // t23a
+        sqsub           v28.8h,  v17.8h,  v22.8h // t21
+        sqadd           v17.8h,  v17.8h,  v22.8h // t22
+        sqadd           v23.8h,  v26.8h,  v29.8h // t24a
+        sqsub           v26.8h,  v26.8h,  v29.8h // t27a
+        sqadd           v22.8h,  v20.8h,  v18.8h // t25
+        sqsub           v20.8h,  v20.8h,  v18.8h // t26
+        sqsub           v29.8h,  v31.8h,  v25.8h // t28a
+        sqadd           v31.8h,  v31.8h,  v25.8h // t31a
+
+        smull_smlsl     v4,  v5,  v2,  v3,  v0.h[2], v0.h[3], .8h // -> t18a
+        smull_smlal     v6,  v7,  v2,  v3,  v0.h[3], v0.h[2], .8h // -> t29a
+        smull_smlsl     v2,  v3,  v29, v24, v0.h[2], v0.h[3], .8h // -> t19
+        rshrn_sz        v18, v4,  v5,  #12, .8h                   // t18a
+        rshrn_sz        v25, v6,  v7,  #12, .8h                   // t29a
+        smull_smlal     v4,  v5,  v29, v24, v0.h[3], v0.h[2], .8h // -> t28
+        smull_smlal     v6,  v7,  v26, v19, v0.h[3], v0.h[2], .8h // -> t20
+        rshrn_sz        v29, v2,  v3,  #12, .8h                   // t19
+        rshrn_sz        v24, v4,  v5,  #12, .8h                   // t28
+        neg             v6.4s,   v6.4s                            // -> t20
+        neg             v7.4s,   v7.4s                            // -> t20
+        smull_smlsl     v2,  v3,  v26, v19, v0.h[2], v0.h[3], .8h // -> t27
+        smull_smlal     v4,  v5,  v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
+        rshrn_sz        v26, v6,  v7,  #12, .8h                   // t20
+        rshrn_sz        v19, v2,  v3,  #12, .8h                   // t27
+        neg             v4.4s,   v4.4s                            // -> t21a
+        neg             v5.4s,   v5.4s                            // -> t21a
+        smull_smlsl     v6,  v7,  v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
+        rshrn_sz        v20, v4,  v5,  #12, .8h                   // t21a
+        rshrn_sz        v28, v6,  v7,  #12, .8h                   // t26a
+
+        sqsub           v2.8h,   v16.8h,  v30.8h // t23
+        sqadd           v16.8h,  v16.8h,  v30.8h // t16 = out16
+        sqsub           v3.8h,   v31.8h,  v23.8h // t24
+        sqadd           v31.8h,  v31.8h,  v23.8h // t31 = out31
+        sqsub           v23.8h,  v21.8h,  v17.8h // t22a
+        sqadd           v17.8h,  v21.8h,  v17.8h // t17a = out17
+        sqadd           v30.8h,  v27.8h,  v22.8h // t30a = out30
+        sqsub           v21.8h,  v27.8h,  v22.8h // t25a
+        sqsub           v27.8h,  v18.8h,  v20.8h // t21
+        sqadd           v18.8h,  v18.8h,  v20.8h // t18 = out18
+        sqadd           v4.8h,   v29.8h,  v26.8h // t19a = out19
+        sqsub           v26.8h,  v29.8h,  v26.8h // t20a
+        sqadd           v29.8h,  v25.8h,  v28.8h // t29 = out29
+        sqsub           v25.8h,  v25.8h,  v28.8h // t26
+        sqadd           v28.8h,  v24.8h,  v19.8h // t28a = out28
+        sqsub           v24.8h,  v24.8h,  v19.8h // t27a
+        mov             v19.16b, v4.16b          // out19
+
+        smull_smlsl     v4,  v5,  v24, v26, v0.h[0], v0.h[0], .8h // -> t20
+        smull_smlal     v6,  v7,  v24, v26, v0.h[0], v0.h[0], .8h // -> t27
+        rshrn_sz        v20, v4,  v5,  #12, .8h   // t20
+        rshrn_sz        v22, v6,  v7,  #12, .8h   // t27
+
+        smull_smlal     v4,  v5,  v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
+        smull_smlsl     v6,  v7,  v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
+        mov             v27.16b,  v22.16b         // t27
+        rshrn_sz        v26, v4,  v5,  #12, .8h   // t26a
+
+        smull_smlsl     v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+        smull_smlal     v4,  v5,  v21, v23, v0.h[0], v0.h[0], .8h // -> t25
+        rshrn_sz        v21, v6,  v7,  #12, .8h   // t21a
+        rshrn_sz        v22, v24, v25, #12, .8h   // t22
+        rshrn_sz        v25, v4,  v5,  #12, .8h   // t25
+
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t23a
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t24a
+        rshrn_sz        v23, v4,  v5,  #12, .8h   // t23a
+        rshrn_sz        v24, v6,  v7,  #12, .8h   // t24a
+
+        ret
+endfunc
+
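+// First pass for 32-point lines, one 32x8 slab per call: the even-indexed
+// input lines feed inv_dct_8h_x16_neon and the odd-indexed ones
+// inv_dct32_odd_8h_x16_neon; store2 then forms the final even +/- odd
+// butterflies, using rev64/ext to reverse the mirrored upper half of the
+// outputs before writing full lines of 32 to x6.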
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x8_neon
+        mov             x14, x30
+        movi            v7.8h,  #0
+        lsl             x8,  x8,  #1
+.if \scale
+        mov             w16, #2896*8
+        dup             v0.4h,   w16
+.endif
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i}, [x7]
+        st1             {v7.8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+.if \scale
+        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct_8h_x16_neon
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.macro store1 r0, r1
+        st1             {\r0}, [x6], #16
+        st1             {\r1}, [x6], #16
+        add             x6,  x6,  #32
+.endm
+        store1          v16.8h,  v24.8h
+        store1          v17.8h,  v25.8h
+        store1          v18.8h,  v26.8h
+        store1          v19.8h,  v27.8h
+        store1          v20.8h,  v28.8h
+        store1          v21.8h,  v29.8h
+        store1          v22.8h,  v30.8h
+        store1          v23.8h,  v31.8h
+.purgem store1
+        sub             x6,  x6,  #64*8
+
+        movi            v7.8h,  #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i}, [x7]
+        st1             {v7.8h}, [x7], x8
+.endr
+.if \scale
+        // This relies on the first pass (inv_dct_8h_x16_neon) having
+        // reloaded v0 from idct_coeffs, which keeps the same 2896*8 scale
+        // constant in v0.h[1]
+        scale_input     .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct32_odd_8h_x16_neon
+        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+        transpose_8x8h  v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
+.macro store2 r0, r1, shift
+        ld1             {v4.8h, v5.8h}, [x6]
+        sqsub           v7.8h,   v4.8h,   \r0
+        sqsub           v6.8h,   v5.8h,   \r1
+        sqadd           v4.8h,   v4.8h,   \r0
+        sqadd           v5.8h,   v5.8h,   \r1
+        rev64           v6.8h,   v6.8h
+        rev64           v7.8h,   v7.8h
+        srshr           v4.8h,   v4.8h,   #\shift
+        srshr           v5.8h,   v5.8h,   #\shift
+        srshr           v6.8h,   v6.8h,   #\shift
+        srshr           v7.8h,   v7.8h,   #\shift
+        ext             v6.16b,  v6.16b,  v6.16b,  #8
+        st1             {v4.8h, v5.8h}, [x6], #32
+        ext             v7.16b,  v7.16b,  v7.16b,  #8
+        st1             {v6.8h, v7.8h}, [x6], #32
+.endm
+
+        store2          v31.8h,  v23.8h, \shift
+        store2          v30.8h,  v22.8h, \shift
+        store2          v29.8h,  v21.8h, \shift
+        store2          v28.8h,  v20.8h, \shift
+        store2          v27.8h,  v19.8h, \shift
+        store2          v26.8h,  v18.8h, \shift
+        store2          v25.8h,  v17.8h, \shift
+        store2          v24.8h,  v16.8h, \shift
+.purgem store2
+        br              x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
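+// Second pass for 32-point columns: the even half's 16 results are written
+// back to the scratch buffer while the odd half's stay in registers; the
+// combine macro then fuses the final even +/- odd butterfly with the #4
+// downshift and the add to the destination pixels, walking the buffer
+// backwards (stride negated into x9) for the mirrored lower half.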
+function inv_txfm_add_vert_dct_8x32_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+
+        bl              inv_dct_8h_x16_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        st1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        sub             x7,  x7,  x8, lsr #1
+        bl              inv_dct32_odd_8h_x16_neon
+
+        neg             x9,  x8
+        mov             x10, x6
+.macro combine r0, r1, r2, r3, op, stride
+        ld1             {v5.8h}, [x7],    \stride
+        ld1             {v2.8b}, [x10],   x1
+        ld1             {v6.8h}, [x7],    \stride
+        ld1             {v3.8b}, [x10],   x1
+        \op             v5.8h,   v5.8h,   \r0
+        ld1             {v7.8h}, [x7],    \stride
+        ld1             {v4.8b}, [x10],   x1
+        srshr           v5.8h,   v5.8h,   #4
+        \op             v6.8h,   v6.8h,   \r1
+        uaddw           v5.8h,   v5.8h,   v2.8b
+        srshr           v6.8h,   v6.8h,   #4
+        \op             v7.8h,   v7.8h,   \r2
+        sqxtun          v2.8b,   v5.8h
+        ld1             {v5.8h}, [x7],    \stride
+        uaddw           v6.8h,   v6.8h,   v3.8b
+        srshr           v7.8h,   v7.8h,   #4
+        \op             v5.8h,   v5.8h,   \r3
+        st1             {v2.8b}, [x6],    x1
+        ld1             {v2.8b}, [x10],   x1
+        sqxtun          v3.8b,   v6.8h
+        uaddw           v7.8h,   v7.8h,   v4.8b
+        srshr           v5.8h,   v5.8h,   #4
+        st1             {v3.8b}, [x6],    x1
+        sqxtun          v4.8b,   v7.8h
+        uaddw           v5.8h,   v5.8h,   v2.8b
+        st1             {v4.8b}, [x6],    x1
+        sqxtun          v2.8b,   v5.8h
+        st1             {v2.8b}, [x6],    x1
+.endm
+        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+        sub             x7,  x7,  x8
+        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+        br              x14
+endfunc
+
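+// eob thresholds for the 32-point loops below: when the block's eob (w3)
+// is below the next table entry, all remaining 8-line slabs are known to
+// hold no coefficients and are skipped or zero-filled.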
+const eob_32x32
+        .short 36, 136, 300, 1024
+endconst
+
+const eob_16x32
+        .short 36, 151, 279, 512
+endconst
+
+const eob_16x32_shortside
+        .short 36, 512
+endconst
+
+const eob_8x32
+        .short 43, 107, 171, 256
+endconst
+
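+// identity_identity 32x32 is pure scaling, handled as transposed 8x8 tiles
+// written straight to the destination; the two x4 identity scalings and
+// all intermediate rounding effectively net out to the single shiftbits=2
+// downshift in load_add_store. The eob tables bound how many tiles are
+// live in each direction.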
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+        movi            v0.8h,  #0
+        movrel          x13, eob_32x32
+
+        mov             x8,  #2*32
+1:
+        mov             w9,  #0
+        movrel          x12, eob_32x32
+2:
+        add             w9,  w9,  #8
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v0.8h}, [x2], x8
+.endr
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        load_add_store_8x8 x0, x7, shiftbits=2
+        ldrh            w11, [x12], #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #2
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #2*8
+        b               1b
+9:
+        ret
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        \op             \i,  \i,  #\shift
+.endr
+.endm
+
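+// 16x32 and 32x16 identity_identity: 2:1 rectangles, so each 8x8 tile gets
+// the 1/sqrt(2) rectangular scale (v1.h[0]) plus the 16-point identity
+// factor (v1.h[1]); the remaining power-of-two factors and rounding are
+// folded into the per-path shifts below.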
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        mov             w16, #2896*8
+        mov             w17, #2*(5793-4096)*8
+        dup             v1.4h,   w16
+        movi            v0.8h,   #0
+        mov             v1.h[1], w17
+        movrel          x13, eob_16x32\hshort
+
+        mov             x8,  #2*\h
+1:
+        mov             w9,  #0
+        movrel          x12, eob_16x32\wshort
+2:
+        add             w9,  w9,  #8
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        ld1             {\i}, [x2]
+        st1             {v0.8h}, [x2], x8
+.endr
+        scale_input     .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+.if \w == 16
+        // 16x32
+        identity_8x8_shift1 v1.h[1]
+.else
+        // 32x16
+        shift_8_regs    sqshl, 1
+        identity_8x8    v1.h[1]
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+        ldrh            w11, [x12], #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #2
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #2*8
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        movi            v0.8h,  #0
+        movrel          x13, eob_8x32
+
+        mov             w8,  #2*\h
+1:
+        ldrh            w12, [x13], #2
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+        ld1             {\i}, [x2]
+        st1             {v0.8h}, [x2], x8
+.endr
+
+.if \w == 8
+        // 8x32
+        shift_8_regs    srshr, 1
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        cmp             w3,  w12
+.if \w == 8
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+        b.lt            9f
+.if \w == 8
+        sub             x2,  x2,  x8, lsl #3
+        add             x2,  x2,  #2*8
+.else
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+.endif
+        b               1b
+
+9:
+        ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+        idct_dc         32,  32,  2
+
+        mov             x15, x30
+        sub             sp,  sp,  #2048
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+        add             x6,  sp,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 24
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_horz_dct_32x8_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #2048
+        br              x15
+endfunc
+
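The large dct_dct functions that follow all share the two-pass shape seen above: a horizontal pass over strips of 8 rows writes into a stack scratch buffer (strips that the eob thresholds prove empty are zero-filled instead, at labels 1:/2:), then a vertical pass over strips of 8 columns rounds and adds the result into the destination. A self-contained C outline with the 1-D transforms replaced by identity copies, so only the data flow is shown:

    #include <stdint.h>
    #include <stddef.h>

    static void two_pass_32x32(uint8_t *dst, ptrdiff_t stride, const int16_t *coef)
    {
        int16_t tmp[32 * 32];                    /* the 2048-byte sp scratch */
        for (int row = 0; row < 32; row += 8)    /* pass 1: 8 rows at a time */
            for (int y = row; y < row + 8; y++)
                for (int x = 0; x < 32; x++)
                    tmp[y * 32 + x] = coef[y * 32 + x]; /* 1-D row transform here */
        for (int col = 0; col < 32; col += 8)    /* pass 2: 8 columns at a time */
            for (int y = 0; y < 32; y++)
                for (int x = col; x < col + 8; x++) {
                    int px = dst[y * stride + x] + ((tmp[y * 32 + x] + 8) >> 4);
                    dst[y * stride + x] = (uint8_t)(px < 0 ? 0 : px > 255 ? 255 : px);
                }
    }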
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+        idct_dc         16,  32,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+        adr             x4,  inv_dct_8h_x16_neon
+
+.irp i, 0, 8, 16, 24
+        add             x6,  sp,  #(\i*16*2)
+        add             x7,  x2,  #(\i*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 24
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        mov             x8,  #2*32
+        bl              inv_txfm_horz_scale_16x8_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #8
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #16*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+        idct_dc         32,  16,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+
+        adr             x5,  inv_dct_8h_x16_neon
+
+.irp i, 0, 8
+        add             x6,  sp,  #(\i*32*2)
+        add             x7,  x2,  #(\i*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  #36
+        b.lt            1f
+.endif
+        mov             x8,  #2*16
+        bl              inv_txfm_horz_scale_dct_32x8_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+        idct_dc         8,   32, 2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+        movrel          x13, eob_8x32
+
+        movi            v28.8h,  #0
+        mov             x8,  #2*32
+        mov             w9,  #32
+        mov             x6,  sp
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v28.8h}, [x2], x8
+.endr
+        ldrh            w12, [x13], #2
+        sub             x2,  x2,  x8, lsl #3
+        sub             w9,  w9,  #8
+        add             x2,  x2,  #2*8
+
+        bl              inv_dct_8h_x8_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        srshr           v\i\().8h,  v\i\().8h,  #2
+.endr
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+        cmp             w3,  w12
+        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+        b.ge            1b
+        cbz             w9,  3f
+
+        movi            v29.8h,  #0
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+2:
+        subs            w9,  w9,  #8
+.rept 2
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        mov             x6,  x0
+        mov             x7,  sp
+        mov             x8,  #8*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+        idct_dc         32,  8,   2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+        mov             x6,  sp
+        mov             x7,  x2
+        mov             x8,  #8*2
+        bl              inv_txfm_horz_dct_32x8_neon
+
+        mov             x8,  #2*32
+        mov             w9,  #0
+1:
+        add             x6,  x0,  x9
+        add             x7,  sp,  x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        add             w9,  w9,  #8
+
+        bl              inv_dct_8h_x8_neon
+
+        cmp             w9,  #32
+
+        load_add_store_8x8 x6, x7
+
+        b.lt            1b
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
+function inv_dct64_step1_neon
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+        ld1             {v0.8h, v1.8h}, [x17], #32
+
+        sqrdmulh        v23.8h,  v16.8h,  v0.h[1]   // t63a
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]   // t32a
+        sqrdmulh        v22.8h,  v17.8h,  v0.h[2]   // t62a
+        sqrdmulh        v17.8h,  v17.8h,  v0.h[3]   // t33a
+        sqrdmulh        v21.8h,  v18.8h,  v0.h[5]   // t61a
+        sqrdmulh        v18.8h,  v18.8h,  v0.h[4]   // t34a
+        sqrdmulh        v20.8h,  v19.8h,  v0.h[6]   // t60a
+        sqrdmulh        v19.8h,  v19.8h,  v0.h[7]   // t35a
+
+        sqadd           v24.8h,  v16.8h,  v17.8h    // t32
+        sqsub           v25.8h,  v16.8h,  v17.8h    // t33
+        sqsub           v26.8h,  v19.8h,  v18.8h    // t34
+        sqadd           v27.8h,  v19.8h,  v18.8h    // t35
+        sqadd           v28.8h,  v20.8h,  v21.8h    // t60
+        sqsub           v29.8h,  v20.8h,  v21.8h    // t61
+        sqsub           v30.8h,  v23.8h,  v22.8h    // t62
+        sqadd           v31.8h,  v23.8h,  v22.8h    // t63
+
+        smull_smlal     v2,  v3,  v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
+        smull_smlsl     v4,  v5,  v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
+        neg             v2.4s,   v2.4s              // t34a
+        neg             v3.4s,   v3.4s              // t34a
+        smull_smlsl     v6,  v7,  v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
+        rshrn_sz        v26, v2,  v3,  #12, .8h     // t34a
+        smull_smlal     v2,  v3,  v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
+        rshrn_sz        v29, v4,  v5,  #12, .8h     // t61a
+        rshrn_sz        v25, v6,  v7,  #12, .8h     // t33a
+        rshrn_sz        v30, v2,  v3,  #12, .8h     // t62a
+
+        sqadd           v16.8h,  v24.8h,  v27.8h    // t32a
+        sqsub           v19.8h,  v24.8h,  v27.8h    // t35a
+        sqadd           v17.8h,  v25.8h,  v26.8h    // t33
+        sqsub           v18.8h,  v25.8h,  v26.8h    // t34
+        sqsub           v20.8h,  v31.8h,  v28.8h    // t60a
+        sqadd           v23.8h,  v31.8h,  v28.8h    // t63a
+        sqsub           v21.8h,  v30.8h,  v29.8h    // t61
+        sqadd           v22.8h,  v30.8h,  v29.8h    // t62
+
+        smull_smlal     v2,  v3,  v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
+        smull_smlsl     v4,  v5,  v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
+        smull_smlal     v6,  v7,  v20, v19, v1.h[2], v1.h[3], .8h // -> t60
+        rshrn_sz        v21, v2,  v3,  #12, .8h     // t61a
+        rshrn_sz        v18, v4,  v5,  #12, .8h     // t34a
+        smull_smlsl     v2,  v3,  v20, v19, v1.h[3], v1.h[2], .8h // -> t35
+        rshrn_sz        v20, v6,  v7,  #12, .8h     // t60
+        rshrn_sz        v19, v2,  v3,  #12, .8h     // t35
+
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+        ret
+endfunc
+
+function inv_dct64_step2_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4h}, [x16]
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        ldr             q16, [x6, #2*8*0]  // t32a
+        ldr             q17, [x9, #2*8*8]  // t39a
+        ldr             q18, [x9, #2*8*0]  // t63a
+        ldr             q19, [x6, #2*8*8]  // t56a
+        ldr             q20, [x6, #2*8*16] // t40a
+        ldr             q21, [x9, #2*8*24] // t47a
+        ldr             q22, [x9, #2*8*16] // t55a
+        ldr             q23, [x6, #2*8*24] // t48a
+
+        sqadd           v24.8h,  v16.8h, v17.8h // t32
+        sqsub           v25.8h,  v16.8h, v17.8h // t39
+        sqadd           v26.8h,  v18.8h, v19.8h // t63
+        sqsub           v27.8h,  v18.8h, v19.8h // t56
+        sqsub           v28.8h,  v21.8h, v20.8h // t40
+        sqadd           v29.8h,  v21.8h, v20.8h // t47
+        sqadd           v30.8h,  v23.8h, v22.8h // t48
+        sqsub           v31.8h,  v23.8h, v22.8h // t55
+
+        smull_smlal     v2,  v3,  v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
+        smull_smlsl     v4,  v5,  v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
+        smull_smlal     v6,  v7,  v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
+        rshrn_sz        v25, v2,  v3,  #12, .8h     // t56a
+        rshrn_sz        v27, v4,  v5,  #12, .8h     // t39a
+        neg             v6.4s,   v6.4s              // t40a
+        neg             v7.4s,   v7.4s              // t40a
+        smull_smlsl     v2,  v3,  v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
+        rshrn_sz        v31, v6,  v7,  #12, .8h     // t40a
+        rshrn_sz        v28, v2,  v3,  #12, .8h     // t55a
+
+        sqadd           v16.8h,  v24.8h,  v29.8h    // t32a
+        sqsub           v19.8h,  v24.8h,  v29.8h    // t47a
+        sqadd           v17.8h,  v27.8h,  v31.8h    // t39
+        sqsub           v18.8h,  v27.8h,  v31.8h    // t40
+        sqsub           v20.8h,  v26.8h,  v30.8h    // t48a
+        sqadd           v23.8h,  v26.8h,  v30.8h    // t63a
+        sqsub           v21.8h,  v25.8h,  v28.8h    // t55
+        sqadd           v22.8h,  v25.8h,  v28.8h    // t56
+
+        smull_smlsl     v2,  v3,  v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
+        smull_smlal     v4,  v5,  v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
+        smull_smlsl     v6,  v7,  v20, v19, v0.h[0], v0.h[0], .8h // -> t47
+        rshrn_sz        v18, v2,  v3,  #12, .8h     // t40a
+        rshrn_sz        v21, v4,  v5,  #12, .8h     // t55a
+        smull_smlal     v2,  v3,  v20, v19, v0.h[0], v0.h[0], .8h // -> t48
+        rshrn_sz        v19, v6,  v7,  #12, .8h     // t47
+        rshrn_sz        v20, v2,  v3,  #12, .8h     // t48
+
+        str             q16, [x6, #2*8*0]  // t32a
+        str             q17, [x9, #2*8*0]  // t39
+        str             q18, [x6, #2*8*8]  // t40a
+        str             q19, [x9, #2*8*8]  // t47
+        str             q20, [x6, #2*8*16] // t48
+        str             q21, [x9, #2*8*16] // t55a
+        str             q22, [x6, #2*8*24] // t56
+        str             q23, [x9, #2*8*24] // t63a
+
+        add             x6,  x6,  #2*8
+        sub             x9,  x9,  #2*8
+        cmp             x6,  x9
+        b.lt            1b
+        ret
+endfunc
+
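The smull_smlal/smull_smlsl pairs in both step functions, each followed by rshrn_sz with #12, implement a Q12 fixed-point rotation: a two-term dot product evaluated in 32-bit precision and rounded back down to 16 bits. A scalar C model (assuming in-range inputs, since rshrn narrows without saturating):

    #include <stdint.h>

    /* d = (a*c0 + b*c1 + 2048) >> 12, the +2048 matching rshrn's rounding. */
    static int16_t rot_q12(int16_t a, int16_t b, int c0, int c1)
    {
        int32_t sum = a * c0 + b * c1;         /* smull + smlal/smlsl */
        return (int16_t)((sum + 2048) >> 12);  /* rshrn_sz ..., #12   */
    }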
+.macro load8 src, strd, zero, clear
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.if \clear
+        ld1             {\i}, [\src]
+        st1             {\zero}, [\src], \strd
+.else
+        ld1             {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        st1             {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        movi            \i,  #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+        movi            \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+        mov             \gpr, \val
+        dup             \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+        st1             \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+        str             \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+        str             \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+        scale_input     .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
+        mov             x14, x30
+        mov             x6,  sp
+        lsl             x8,  x8,  #2
+
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        load8           x7,  x8,  v7.8h, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        add             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct_8h_x16_neon
+
+        store16         x6
+
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        load8           x7,  x8,  v7.8h, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        lsr             x8,  x8,  #1
+        sub             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct32_odd_8h_x16_neon
+
+        add             x10, x6,  #16*15
+        sub             x6,  x6,  #16*16
+
+        mov             x9,  #-16
+
+.macro store_addsub r0, r1, r2, r3
+        ld1             {v2.8h}, [x6], #16
+        ld1             {v3.8h}, [x6], #16
+        sqadd           v6.8h,  v2.8h,  \r0
+        sqsub           \r0,    v2.8h,  \r0
+        ld1             {v4.8h}, [x6], #16
+        sqadd           v7.8h,  v3.8h,  \r1
+        sqsub           \r1,    v3.8h,  \r1
+        ld1             {v5.8h}, [x6], #16
+        sqadd           v2.8h,  v4.8h,  \r2
+        sub             x6,  x6,  #16*4
+        sqsub           \r2,    v4.8h,  \r2
+        st1             {v6.8h}, [x6], #16
+        st1             {\r0},   [x10], x9
+        sqadd           v3.8h,  v5.8h,  \r3
+        sqsub           \r3,    v5.8h,  \r3
+        st1             {v7.8h}, [x6], #16
+        st1             {\r1},   [x10], x9
+        st1             {v2.8h}, [x6], #16
+        st1             {\r2},   [x10], x9
+        st1             {v3.8h}, [x6], #16
+        st1             {\r3},   [x10], x9
+.endm
+        store_addsub    v31.8h, v30.8h, v29.8h, v28.8h
+        store_addsub    v27.8h, v26.8h, v25.8h, v24.8h
+        store_addsub    v23.8h, v22.8h, v21.8h, v20.8h
+        store_addsub    v19.8h, v18.8h, v17.8h, v16.8h
+.purgem store_addsub
+
+        add             x6,  x6,  #2*8*16
+
+        movrel          x17, idct64_coeffs
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        add             x9,  x7,  x8, lsl #4 // offset 16
+        add             x10, x7,  x8, lsl #3 // offset 8
+        sub             x9,  x9,  x8         // offset 15
+        sub             x11, x10, x8         // offset 7
+        ld1             {v16.8h}, [x7]  // in1  (offset 0)
+        ld1             {v17.8h}, [x9]  // in31 (offset 15)
+        ld1             {v18.8h}, [x10] // in17 (offset 8)
+        ld1             {v19.8h}, [x11] // in15 (offset 7)
+        st1_if          {v7.8h}, [x7],  \clear
+        st1_if          {v7.8h}, [x9],  \clear
+        st1_if          {v7.8h}, [x10], \clear
+        st1_if          {v7.8h}, [x11], \clear
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        add             x7,  x7,  x8, lsl #2 // offset 4
+        sub             x9,  x9,  x8, lsl #2 // offset 11
+        sub             x10, x7,  x8         // offset 3
+        add             x11, x9,  x8         // offset 12
+        ld1             {v16.8h}, [x10] // in7  (offset 3)
+        ld1             {v17.8h}, [x11] // in25 (offset 12)
+        ld1             {v18.8h}, [x9]  // in23 (offset 11)
+        ld1             {v19.8h}, [x7]  // in9  (offset 4)
+        st1_if          {v7.8h}, [x7],  \clear
+        st1_if          {v7.8h}, [x9],  \clear
+        st1_if          {v7.8h}, [x10], \clear
+        st1_if          {v7.8h}, [x11], \clear
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        sub             x10, x10, x8, lsl #1 // offset 1
+        sub             x9,  x9,  x8, lsl #1 // offset 9
+        add             x7,  x7,  x8         // offset 5
+        add             x11, x11, x8         // offset 13
+        ldr             q16, [x10, x8] // in5  (offset 2)
+        ldr             q17, [x11]     // in27 (offset 13)
+        ldr             q18, [x9,  x8] // in21 (offset 10)
+        ldr             q19, [x7]      // in11 (offset 5)
+        stroff_if       q7,  [x10, x8], \clear
+        str_if          q7,  [x11],     \clear
+        stroff_if       q7,  [x9,  x8], \clear
+        str_if          q7,  [x7],      \clear
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movdup_if       v0.4h, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear
+        ldr             q16, [x10]     // in3  (offset 1)
+        ldr             q17, [x11, x8] // in29 (offset 14)
+        ldr             q18, [x9]      // in19 (offset 9)
+        ldr             q19, [x7,  x8] // in13 (offset 6)
+        str_if          q7,  [x10],     \clear
+        stroff_if       q7,  [x11, x8], \clear
+        str_if          q7,  [x9],      \clear
+        stroff_if       q7,  [x7,  x8], \clear
+        scale_if        \scale, v0.h[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+
+        sub             x6,  x6,  #2*8*32
+        add             x9,  x6,  #2*8*7
+
+        bl              inv_dct64_step2_neon
+
+        br              x14
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
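def_dct64_func stamps out three specializations of one 64-point DCT body: the plain variant for the second (vertical) pass over already-transformed intermediates, _clear for first passes that zero coefficients as they are read, and _clear_scale for rectangular sizes such as 64x32, which additionally pre-scale inputs by 2896/4096 ~ 1/sqrt(2). Expanding the variants at assembly time keeps the flag tests out of the hot loop; a C analogue of the template, with the butterflies omitted and all names illustrative:

    #include <stdint.h>
    #include <stddef.h>

    #define DEF_DCT64(name, clear, scale)                                      \
    static void name(int16_t *out, int16_t *in, ptrdiff_t stride)              \
    {                                                                          \
        for (int i = 0; i < 64; i++) {                                         \
            int32_t v = in[i * stride];                                        \
            if (scale) v = (v * 2896 + 2048) >> 12; /* ~ v / sqrt(2)    */     \
            if (clear) in[i * stride] = 0;          /* consume and zero */     \
            out[i] = (int16_t)v;  /* real code feeds v to the butterflies */   \
        }                                                                      \
    }
    DEF_DCT64(dct64_plain,       0, 0)
    DEF_DCT64(dct64_clear,       1, 0)
    DEF_DCT64(dct64_clear_scale, 1, 1)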
+
+function inv_txfm_horz_dct_64x8_neon
+        mov             x14, x30
+
+        mov             x7,  sp
+        add             x8,  sp,  #2*8*(64 - 4)
+        add             x9,  x6,  #2*56
+        mov             x10, #2*64
+        mov             x11, #-2*8*4
+
+        dup             v7.8h,  w12
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+        sqsub           v1.8h,   \src0,   \src1
+        sqadd           v0.8h,   \src0,   \src1
+        sqsub           v3.8h,   \src2,   \src3
+        srshl           v1.8h,   v1.8h,   v7.8h
+        sqadd           v2.8h,   \src2,   \src3
+        srshl           v0.8h,   v0.8h,   v7.8h
+        srshl           v3.8h,   v3.8h,   v7.8h
+        rev64           v1.8h,   v1.8h
+        srshl           v2.8h,   v2.8h,   v7.8h
+        rev64           v3.8h,   v3.8h
+        ext             v1.16b,  v1.16b,  v1.16b,  #8
+        st1             {v0.8h},  [x6], x10
+        ext             v3.16b,  v3.16b,  v3.16b,  #8
+        st1             {v1.8h},  [x9], x10
+        st1             {v2.8h},  [x6], x10
+        st1             {v3.8h},  [x9], x10
+.endm
+        store_addsub    v16.8h,  v31.8h,  v17.8h,  v30.8h
+        store_addsub    v18.8h,  v29.8h,  v19.8h,  v28.8h
+        store_addsub    v20.8h,  v27.8h,  v21.8h,  v26.8h
+        store_addsub    v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem store_addsub
+        sub             x6,  x6,  x10, lsl #3
+        sub             x9,  x9,  x10, lsl #3
+        add             x6,  x6,  #16
+        sub             x9,  x9,  #16
+
+        cmp             x7,  x8
+        b.lt            1b
+        br              x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+        mov             x7,  sp
+        add             x8,  sp,  #2*8*(64 - 4)
+        add             x9,  x6,  x1, lsl #6
+        sub             x9,  x9,  x1
+        neg             x10, x1
+        mov             x11, #-2*8*4
+
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+.macro add_dest_addsub src0, src1, src2, src3
+        ld1             {v0.8b}, [x6], x1
+        ld1             {v1.8b}, [x9], x10
+        sqadd           v4.8h,   \src0,   \src1
+        ld1             {v2.8b}, [x6]
+        sqsub           v5.8h,   \src0,   \src1
+        ld1             {v3.8b}, [x9]
+        sqadd           v6.8h,   \src2,   \src3
+        sqsub           v7.8h,   \src2,   \src3
+        sub             x6,  x6,  x1
+        sub             x9,  x9,  x10
+        srshr           v4.8h,   v4.8h,   #4
+        srshr           v5.8h,   v5.8h,   #4
+        srshr           v6.8h,   v6.8h,   #4
+        uaddw           v4.8h,   v4.8h,   v0.8b
+        srshr           v7.8h,   v7.8h,   #4
+        uaddw           v5.8h,   v5.8h,   v1.8b
+        uaddw           v6.8h,   v6.8h,   v2.8b
+        sqxtun          v0.8b,   v4.8h
+        uaddw           v7.8h,   v7.8h,   v3.8b
+        sqxtun          v1.8b,   v5.8h
+        st1             {v0.8b}, [x6], x1
+        sqxtun          v2.8b,   v6.8h
+        st1             {v1.8b}, [x9], x10
+        sqxtun          v3.8b,   v7.8h
+        st1             {v2.8b}, [x6], x1
+        st1             {v3.8b}, [x9], x10
+.endm
+        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
+        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
+        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
+        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem add_dest_addsub
+        cmp             x7,  x8
+        b.lt            1b
+
+        br              x14
+endfunc
+
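Both helpers above fold the final butterfly stage of the 64-point transform into the store: the sum half goes out through an ascending pointer and the difference half through a descending one (hence rev64 + ext to reverse lanes within each vector in the horizontal version, and the negative stride in x10 in the vertical one). Assuming the even/odd pairing that the scratch layout establishes, the scalar shape is:

    #include <stdint.h>

    /* out[i] = a[i] + b[i], out[2n-1-i] = a[i] - b[i] (n = 32 here). */
    static void butterfly_store(int16_t *out, const int16_t *a,
                                const int16_t *b, int n)
    {
        for (int i = 0; i < n; i++) {
            out[i]             = (int16_t)(a[i] + b[i]); /* sqadd, forward  */
            out[2 * n - 1 - i] = (int16_t)(a[i] - b[i]); /* sqsub, backward */
        }
    }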
+.macro sub_sp space
+#ifdef _WIN32
+.if \space > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer.
+        .error          "sub_sp doesn't support values over 8K at the moment"
+.elseif \space > 4096
+        sub             x16, sp,  #4096
+        ldr             xzr, [x16]
+        sub             sp,  x16, #(\space - 4096)
+.else
+        sub             sp,  sp,  #\space
+.endif
+#else
+.if \space >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+#endif
+.endm
+
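The _WIN32 branch of sub_sp exists because Windows commits stack one guard page at a time: dropping sp by more than a page without touching the memory in between can fault past the guard. For allocations between 4 KiB and 8 KiB the macro therefore loads from an intermediate page before the final decrement (compilers emit __chkstk for the same purpose). A C illustration of the underlying rule, not of the macro itself:

    #include <stddef.h>

    /* Touch every 4 KiB page of a freshly extended region, in order, so the
     * OS can commit one guard page at a time. */
    static void probe_pages(volatile char *base, size_t size)
    {
        for (size_t off = 0; off < size; off += 4096)
            base[off] = 0;
    }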
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+        idct_dc         64,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #32*2
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_8h_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_dct_8h_x64_neon
+        add             x6,  x0,  #(\i)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+        idct_dc         64,  32,  1
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #32*2
+        mov             x12, #-1 // shift
+        bl              inv_txfm_dct_clear_scale_8h_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i)
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+        idct_dc         32,  64,  1
+
+        mov             x15, x30
+
+        sub_sp          32*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+        add             x6,  x5,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_horz_scale_dct_32x8_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_dct_8h_x64_neon
+        add             x6,  x0,  #(\i)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #32*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+        idct_dc         64,  16,  2
+
+        mov             x15, x30
+
+        sub_sp          64*16*2+64*8*2
+        add             x4,  sp, #64*8*2
+
+        movrel          x13, eob_16x32
+
+.irp i, 0, 8
+        add             x6,  x4,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #16*2
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_8h_x64_neon
+        add             x6,  x4,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        adr             x5,  inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i)
+        add             x7,  x4,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  x4,  #64*16*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+        idct_dc         16,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          16*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+
+        adr             x4,  inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24
+        add             x6,  x5,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_horz_16x8_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #8
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #16*2
+        bl              inv_txfm_dct_8h_x64_neon
+        add             x6,  x0,  #(\i)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #16*32*2
+        br              x15
+endfunc
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
new file mode 100644
index 0000000..266f57e
--- /dev/null
+++ b/src/arm/64/itx16.S
@@ -0,0 +1,3526 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+//                int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3  external parameters
+// x4     function pointer to first transform
+// x5     function pointer to second transform
+// x6     output parameter for helper function
+// x7     input parameter for helper function
+// x8     input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13    pointer to list of eob thresholds
+// x14    return pointer for helper function
+// x15    return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1   multiplication coefficients
+// v2-v7   scratch registers
+// v8-v15  unused
+// v16-v31 inputs/outputs of transforms
+
+const idct_coeffs, align=4
+        // idct4
+        .int            2896, 2896*8*(1<<16), 1567, 3784
+        // idct8
+        .int            799, 4017, 3406, 2276
+        // idct16
+        .int            401, 4076, 3166, 2598
+        .int            1931, 3612, 3920, 1189
+        // idct32
+        .int            201, 4091, 3035, 2751
+        .int            1751, 3703, 3857, 1380
+        .int            995, 3973, 3513, 2106
+        .int            2440, 3290, 4052, 601
+endconst
+
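These constants are Q12 cosines: each entry is round(4096 * cos(k*pi/64)) for the angles the 4/8/16/32-point DCTs need, with 2896 = round(4096 * cos(pi/4)), i.e. 1/sqrt(2). A quick self-contained check of the first few values (the k correspondences are inferred here, not taken from the source):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = 3.14159265358979323846;
        printf("%ld\n", lround(4096 * cos(16 * pi / 64))); /* 2896 */
        printf("%ld\n", lround(4096 * cos(24 * pi / 64))); /* 1567 */
        printf("%ld\n", lround(4096 * cos( 8 * pi / 64))); /* 3784 */
        printf("%ld\n", lround(4096 * cos(28 * pi / 64))); /*  799 */
        printf("%ld\n", lround(4096 * cos( 4 * pi / 64))); /* 4017 */
        return 0;
    }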
+const idct64_coeffs, align=4
+        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+        .int            4076, 401, 4017, 799
+
+        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+        .int            -3166, -2598, -799, -4017
+
+        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+        .int            3612, 1931, 2276, 3406
+
+        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+        .int            -3920, -1189, -3406, -2276
+endconst
+
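The entries written as c*8*(1<<16) pre-shift the coefficient to c<<19 so that sqrdmulh on 32-bit lanes yields the Q12 product directly: sqrdmulh computes (2*a*b + 2^31) >> 32, and with b = c<<19 that is exactly (a*c + 2^11) >> 12. A scalar C model (the instruction's saturation is omitted for brevity):

    #include <stdint.h>

    static int32_t sqrdmulh_s32(int32_t a, int32_t b)
    {
        int64_t p = 2 * (int64_t)a * b + ((int64_t)1 << 31);
        return (int32_t)(p >> 32);
    }
    /* sqrdmulh_s32(a, c << 19) == (a * c + 2048) >> 12 for in-range a and c. */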
+const iadst4_coeffs, align=4
+        .int            1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+        .int            4076, 401, 3612, 1931
+        .int            2598, 3166, 1189, 3920
+        // idct_coeffs
+        .int            2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+        .int            4091, 201, 3973, 995
+        .int            3703, 1751, 3290, 2440
+        .int            2751, 3035, 2106, 3513
+        .int            1380, 3857, 601, 4052
+endconst
+
+.macro mul_mla d, s0, s1, c0, c1
+        mul             \d\().4s, \s0\().4s, \c0
+        mla             \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+        mul             \d\().4s, \s0\().4s, \c0
+        mls             \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+        sqrdmulh        \r0\sz,  \r0\sz,  \c
+        sqrdmulh        \r1\sz,  \r1\sz,  \c
+        sqrdmulh        \r2\sz,  \r2\sz,  \c
+        sqrdmulh        \r3\sz,  \r3\sz,  \c
+.ifnb \r4
+        sqrdmulh        \r4\sz,  \r4\sz,  \c
+        sqrdmulh        \r5\sz,  \r5\sz,  \c
+        sqrdmulh        \r6\sz,  \r6\sz,  \c
+        sqrdmulh        \r7\sz,  \r7\sz,  \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+        ld1             {\load},  [\src], x1
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #\shiftbits
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h
+.endif
+.ifnb \store
+        st1             {\store},  [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+        load_add_store  v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+        load_add_store  v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+        load_add_store  v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+        load_add_store  v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+        load_add_store  v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+        load_add_store  v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+        load_add_store  v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+        load_add_store  v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
+        load_add_store       ,       , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
+        load_add_store       ,       , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
+        load_add_store       ,       ,      ,       , v31.8h, v30.8h, v29.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       , v31.8h, v30.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       ,       , v31.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v23.8h, v22.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v23.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
+.ifnb \load
+        ld1             {\load}[0],  [\src], x1
+.endif
+.ifnb \inssrc
+        ins             \insdst\().d[1],   \inssrc\().d[0]
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #4
+.endif
+.ifnb \load
+        ld1             {\load}[1],  [\src], x1
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+        st1             {\store}[0],  [\dst], x1
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h
+.endif
+.ifnb \store
+        st1             {\store}[1],  [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
+        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
+        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
+        load_add_store4     ,    ,    , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v28.8h, v26.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v30.8h, v28.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v30.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
+        load_add_store4     ,    ,    , v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
+        load_add_store4     ,    ,    , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v22.8h, v20.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v22.d, \dst, \src
+.endm
+
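The load_add_store macros software-pipeline the store path: each invocation performs a load, a rounding shift, an add, a max/min clamp and a store that belong to different rows, so the staggered call sequences keep every stage busy. v6/v7 hold the clamp bounds, and mvni v7.8h, #0xfc, lsl #8 builds ~0xfc00 = 0x3ff, the 10-bit pixel maximum this path clamps to. One lane of the scalar arithmetic:

    #include <stdint.h>

    /* Round the residual, add it to the pixel, clamp to [0, 0x3ff]. */
    static uint16_t add_residual_10bit(uint16_t px, int16_t res, int shiftbits)
    {
        int v = px + ((res + (1 << (shiftbits - 1))) >> shiftbits); /* srshr, sqadd */
        return (uint16_t)(v < 0 ? 0 : v > 0x3ff ? 0x3ff : v);       /* smax, smin   */
    }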
+.macro idct_dc w, h, shift
+        cbnz            w3,  1f
+        movz            w16, #2896*8, lsl #16
+        ld1r            {v16.4s}, [x2]
+        dup             v0.2s,   w16
+        sqrdmulh        v20.4s,  v16.4s,  v0.s[0]
+        str             wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+        sqrdmulh        v20.4s,  v20.4s,  v0.s[0]
+.endif
+.if \shift > 0
+        sqrshrn         v16.4h,  v20.4s,  #\shift
+        sqrshrn2        v16.8h,  v20.4s,  #\shift
+.else
+        sqxtn           v16.4h,  v20.4s
+        sqxtn2          v16.8h,  v20.4s
+.endif
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[1]
+        srshr           v16.8h,  v16.8h,  #4
+        mov             w4,  #\h
+        b               idct_dc_w\w\()_neon
+1:
+.endm
+
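idct_dc implements the eob == 0 shortcut: only the DC coefficient is nonzero, so the whole 2-D transform collapses to one residual added uniformly across the block. The coefficient is scaled by 1/sqrt(2) once per pass (and once more for 2:1 rectangular sizes), narrowed by the pass shift, and finally rounded by 4 bits. A scalar sketch of the residual computation:

    #include <stdint.h>

    static int32_t rnd12(int32_t a) { return (a * 2896 + 2048) >> 12; } /* ~ a/sqrt(2) */

    static int16_t idct_dc_residual(int32_t dc, int w, int h, int shift)
    {
        dc = rnd12(dc);                               /* first-pass scale  */
        if (w == 2 * h || 2 * w == h)
            dc = rnd12(dc);                           /* rectangular extra */
        if (shift > 0)
            dc = (dc + (1 << (shift - 1))) >> shift;  /* sqrshrn #shift    */
        dc = rnd12(dc);                               /* second-pass scale */
        return (int16_t)((dc + 8) >> 4);              /* srshr #4          */
    }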
+function idct_dc_w4_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[0], [x0], x1
+        subs            w4,  w4,  #4
+        ld1             {v1.d}[1], [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        st1             {v0.d}[0], [x0], x1
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w8_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h}, [x0], x1
+        subs            w4,  w4,  #4
+        ld1             {v1.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v2.8h}, [x0], x1
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        ld1             {v3.8h}, [x0], x1
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.8h}, [x0], x1
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h}, [x0], x1
+        st1             {v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w16_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h}, [x0], x1
+        subs            w4,  w4,  #2
+        ld1             {v2.8h, v3.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #1
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h, v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w32_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+        subs            w4,  w4,  #1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w64_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x1,  x1,  #64
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        subs            w4,  w4,  #1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  #64
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sqadd           v4.8h,   v4.8h,   v16.8h
+        sqadd           v5.8h,   v5.8h,   v16.8h
+        sqadd           v6.8h,   v6.8h,   v16.8h
+        sqadd           v7.8h,   v7.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smax            v6.8h,   v6.8h,   v30.8h
+        smax            v7.8h,   v7.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        smin            v6.8h,   v6.8h,   v31.8h
+        smin            v7.8h,   v7.8h,   v31.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+.macro iwht4
+        add             v16.4s,  v16.4s,  v17.4s
+        sub             v21.4s,  v18.4s,  v19.4s
+        sub             v20.4s,  v16.4s,  v21.4s
+        sshr            v20.4s,  v20.4s,  #1
+        sub             v18.4s,  v20.4s,  v17.4s
+        sub             v17.4s,  v20.4s,  v19.4s
+        add             v19.4s,  v21.4s,  v18.4s
+        sub             v16.4s,  v16.4s,  v17.4s
+.endm
+
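iwht4 is the lossless 4-point inverse Walsh-Hadamard transform, expressed as lifting steps so that it is exactly invertible in integer arithmetic. A direct C transcription, with io[0..3] standing for v16..v19:

    #include <stdint.h>

    static void iwht4(int32_t io[4])
    {
        int32_t a = io[0] + io[1];   /* add  v16, v16, v17   */
        int32_t e = io[2] - io[3];   /* sub  v21, v18, v19   */
        int32_t t = (a - e) >> 1;    /* sub + sshr #1 -> v20 */
        int32_t c = t - io[1];       /* sub  v18, v20, v17   */
        int32_t b = t - io[3];       /* sub  v17, v20, v19   */
        io[3] = e + c;               /* add  v19, v21, v18   */
        io[0] = a - b;               /* sub  v16, v16, v17   */
        io[1] = b;
        io[2] = c;
    }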
+.macro idct_4 r0, r1, r2, r3
+        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2]
+        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
+        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
+        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
+        srshr           v6.4s,  v6.4s,  #12
+        srshr           v7.4s,  v4.4s,  #12
+        srshr           v2.4s,  v2.4s,  #12
+        srshr           v3.4s,  v3.4s,  #12
+        sqadd           \r0\().4s,  v2.4s,   v6.4s
+        sqsub           \r3\().4s,  v2.4s,   v6.4s
+        sqadd           \r1\().4s,  v3.4s,   v7.4s
+        sqsub           \r2\().4s,  v3.4s,   v7.4s
+.endm
+
+function inv_dct_4s_x4_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16]
+        idct_4          v16, v17, v18, v19
+        ret
+endfunc
+
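A scalar model of idct_4 above, with the Q12 constants read from idct_coeffs (2896 ~ 4096*cos(pi/4); the 1567/3784 pair ~ 4096*cos/sin(3*pi/8)):

    #include <stdint.h>

    static void idct4(int32_t io[4])
    {
        int32_t t3 = (io[1] * 3784 + io[3] * 1567 + 2048) >> 12; /* mul_mla v6 */
        int32_t t2 = (io[1] * 1567 - io[3] * 3784 + 2048) >> 12; /* mul_mls v4 */
        int32_t t0 = ((io[0] + io[2]) * 2896 + 2048) >> 12;      /* mul_mla v2 */
        int32_t t1 = ((io[0] - io[2]) * 2896 + 2048) >> 12;      /* mul_mls v3 */
        io[0] = t0 + t3;             /* sqadd */
        io[3] = t0 - t3;             /* sqsub */
        io[1] = t1 + t2;             /* sqadd */
        io[2] = t1 - t2;             /* sqsub */
    }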
+.macro iadst_4x4 o0, o1, o2, o3
+        movrel          x16, iadst4_coeffs
+        ld1             {v0.4s}, [x16]
+
+        sub             v3.4s,   v16.4s,  v18.4s
+        mul             v4.4s,   v16.4s,  v0.s[0]
+        mla             v4.4s,   v18.4s,  v0.s[1]
+        mla             v4.4s,   v19.4s,  v0.s[2]
+        mul             v7.4s,   v17.4s,  v0.s[3]
+        add             v3.4s,   v3.4s,   v19.4s
+        mul             v5.4s,   v16.4s,  v0.s[2]
+        mls             v5.4s,   v18.4s,  v0.s[0]
+        mls             v5.4s,   v19.4s,  v0.s[1]
+
+        add             \o3\().4s, v4.4s,     v5.4s
+        mul             \o2\().4s, v3.4s,     v0.s[3]
+        add             \o0\().4s, v4.4s,     v7.4s
+        add             \o1\().4s, v5.4s,     v7.4s
+        sub             \o3\().4s, \o3\().4s, v7.4s
+
+        srshr           \o0\().4s, \o0\().4s, #12
+        srshr           \o2\().4s, \o2\().4s, #12
+        srshr           \o1\().4s, \o1\().4s, #12
+        srshr           \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon
+        iadst_4x4       v16, v17, v18, v19
+        ret
+endfunc
+
+function inv_flipadst_4s_x4_neon
+        iadst_4x4       v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x4_neon
+        movz            w16, #(5793-4096)*8, lsl #16
+        dup             v0.2s,   w16
+        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
+        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
+        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
+        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
+        sqadd           v16.4s,  v16.4s,  v4.4s
+        sqadd           v17.4s,  v17.4s,  v5.4s
+        sqadd           v18.4s,  v18.4s,  v6.4s
+        sqadd           v19.4s,  v19.4s,  v7.4s
+        ret
+endfunc
+
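+// 4x4 WHT entry point: clears the coefficient buffer as it loads,
+// applies iwht4 in both dimensions with a transpose in between, and
+// joins the common add/clip tail at L(itx_4x4_end).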
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+        mov             x15, x30
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v30.4s, v31.4s}, [x2], #32
+
+        sshr            v16.4s,  v16.4s,  #2
+        sshr            v17.4s,  v17.4s,  #2
+        sshr            v18.4s,  v18.4s,  #2
+        sshr            v19.4s,  v19.4s,  #2
+
+        iwht4
+
+        st1             {v30.4s, v31.4s}, [x2], #32
+        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23
+
+        iwht4
+
+        ld1             {v0.d}[0], [x0], x1
+        sqxtn           v16.4h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqxtn2          v16.8h,  v17.4s
+        ld1             {v1.d}[0], [x0], x1
+        sqxtn           v18.4h,  v18.4s
+        ld1             {v1.d}[1], [x0], x1
+        sqxtn2          v18.8h,  v19.4s
+
+        b               L(itx_4x4_end)
+endfunc
+
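+// Shared 4x4 two-pass driver: the first 1-D transform (via x4) works
+// on 32-bit coefficients, which are saturated down to 16 bit and
+// transposed before the second 1-D transform (via x5) and the common
+// add/clip tail.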
+function inv_txfm_add_4x4_neon
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v30.4s, v31.4s}, [x2], #32
+
+        blr             x4
+
+        st1             {v30.4s, v31.4s}, [x2], #32
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x5
+
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ins             v16.d[1], v17.d[0]
+        ins             v18.d[1], v19.d[0]
+        ld1             {v1.d}[0], [x0], x1
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v16.8h,  v16.8h,  #4
+        srshr           v18.8h,  v18.8h,  #4
+
+L(itx_4x4_end):
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x0,  x0,  x1, lsl #2
+        sqadd           v16.8h,  v16.8h,  v0.8h
+        sqadd           v18.8h,  v18.8h,  v1.8h
+        smax            v16.8h,  v16.8h,  v30.8h
+        smax            v18.8h,  v18.8h,  v30.8h
+        smin            v16.8h,  v16.8h,  v31.8h
+        st1             {v16.d}[0], [x0], x1
+        smin            v18.8h,  v18.8h,  v31.8h
+        st1             {v16.d}[1], [x0], x1
+        st1             {v18.d}[0], [x0], x1
+        st1             {v18.d}[1], [x0], x1
+
+        br              x15
+endfunc
+
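+// Generate one exported 4x4 entry point per transform pair. dct_dct
+// gets a DC-only fast path for eob == 0 (the eob is in w3): the lone
+// DC coefficient is scaled by 2896/4096 once per pass with sqrdmulh
+// and fed straight into the add/clip tail.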
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        cbnz            w3,  1f
+        movz            w16, #2896*8, lsl #16
+        ld1r            {v16.4s}, [x2]
+        dup             v4.2s,   w16
+        str             wzr, [x2]
+        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
+        ld1             {v0.d}[0], [x0], x1
+        sqxtn           v20.4h,  v16.4s
+        sqxtn2          v20.8h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqrdmulh        v20.8h,  v20.8h,  v4.h[1]
+        ld1             {v1.d}[0], [x0], x1
+        srshr           v16.8h,  v20.8h,  #4
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v18.8h,  v20.8h,  #4
+        movi            v30.8h,  #0
+        b               L(itx_4x4_end)
+1:
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x4_neon
+        movrel          x5,  X(inv_\txfm2\()_4h_x4_neon)
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
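+// In-place inverse 8-point DCT: the even half reuses idct_4 on
+// \r0/\r2/\r4/\r6, the odd half derives t4a-t7a from the second set
+// of idct_coeffs in v1, and the halves are merged with saturating
+// butterflies.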
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+        idct_4          \r0, \r2, \r4, \r6
+
+        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
+        mul_mla         v4,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
+        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
+        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
+        srshr           \r1\().4s, v2.4s,  #12           // t4a
+        srshr           \r7\().4s, v4.4s,  #12           // t7a
+        srshr           \r3\().4s, v6.4s,  #12           // t5a
+        srshr           \r5\().4s, v7.4s,  #12           // t6a
+
+        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
+        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
+        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
+        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a
+
+        mul_mls         v4,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
+        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
+        srshr           v4.4s,  v4.4s,  #12              // t5
+        srshr           v5.4s,  v6.4s,  #12              // t6
+
+        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
+        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
+        sqadd           \r1\().4s,  \r2\().4s,  v5.4s    // out1
+        sqsub           v6.4s,      \r2\().4s,  v5.4s    // out6
+        sqadd           \r2\().4s,  \r4\().4s,  v4.4s    // out2
+        sqsub           \r5\().4s,  \r4\().4s,  v4.4s    // out5
+        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
+        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
+        mov             \r6\().16b, v6.16b               // out6
+.endm
+
+function inv_dct_4s_x8_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16]
+        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
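+// Inverse 8-point ADST; flipadst reuses this macro with the outputs
+// reversed, which is why the out* comments list the two possible
+// destination registers.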
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+        movrel          x16, iadst8_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
+        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
+        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
+        srshr           v16.4s, v2.4s,  #12  // t0a
+        srshr           v23.4s, v4.4s,  #12  // t1a
+        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
+        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
+        srshr           v18.4s, v6.4s,  #12  // t2a
+        srshr           v21.4s, v2.4s,  #12  // t3a
+        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
+        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
+        srshr           v20.4s, v4.4s,  #12  // t4a
+        srshr           v19.4s, v6.4s,  #12  // t5a
+        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
+        srshr           v22.4s, v2.4s,  #12  // t6a
+        srshr           v17.4s, v4.4s,  #12  // t7a
+
+        ld1             {v0.4s}, [x16]
+
+        sqadd           v2.4s,   v16.4s,  v20.4s // t0
+        sqsub           v3.4s,   v16.4s,  v20.4s // t4
+        sqadd           v4.4s,   v23.4s,  v19.4s // t1
+        sqsub           v5.4s,   v23.4s,  v19.4s // t5
+        sqadd           v6.4s,   v18.4s,  v22.4s // t2
+        sqsub           v7.4s,   v18.4s,  v22.4s // t6
+        sqadd           v18.4s,  v21.4s,  v17.4s // t3
+        sqsub           v19.4s,  v21.4s,  v17.4s // t7
+
+        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
+        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
+        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]
+
+        srshr           v3.4s,  v16.4s, #12  // t4a
+        srshr           v5.4s,  v20.4s, #12  // t5a
+
+        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]
+
+        srshr           v7.4s,  v22.4s, #12  // t6a
+        srshr           v19.4s, v16.4s, #12  // t7a
+
+        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
+        sqsub           v2.4s,     v2.4s, v6.4s  // t2
+        sqadd           \o7\().4s, v4.4s, v18.4s // out7
+        sqsub           v4.4s,     v4.4s, v18.4s // t3
+        sqneg           \o7\().4s, \o7\().4s     // out7
+
+        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
+        sqsub           v3.4s,     v3.4s, v7.4s  // t6
+        sqadd           \o6\().4s, v5.4s, v19.4s // out6
+        sqsub           v5.4s,     v5.4s, v19.4s // t7
+        sqneg           \o1\().4s, \o1\().4s     // out1
+
+        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+        srshr           v2.4s,  v18.4s, #12 // out3
+        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+        srshr           v3.4s,  v20.4s, #12 // out5
+        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)
+
+        sqneg           \o3\().4s, v2.4s     // out3
+        sqneg           \o5\().4s, v3.4s     // out5
+.endm
+
+function inv_adst_4s_x8_neon
+        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
+function inv_flipadst_4s_x8_neon
+        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x8_neon
+        sqshl           v16.4s,  v16.4s,  #1
+        sqshl           v17.4s,  v17.4s,  #1
+        sqshl           v18.4s,  v18.4s,  #1
+        sqshl           v19.4s,  v19.4s,  #1
+        sqshl           v20.4s,  v20.4s,  #1
+        sqshl           v21.4s,  v21.4s,  #1
+        sqshl           v22.4s,  v22.4s,  #1
+        sqshl           v23.4s,  v23.4s,  #1
+        ret
+endfunc
+
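+// 8x8 driver: w3 (the eob) is compared against w13 (eob_half, set by
+// def_fn_8x8 below); when the eob is below it, the upper half of the
+// coefficients is known to be zero, so its first pass is skipped and
+// v24-v27 are zeroed instead.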
+function inv_txfm_add_8x8_neon
+        movi            v31.4s,  #0
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x6]
+        st1             {v31.4s}, [x6], x11
+.endr
+
+        blr             x4
+
+        sqrshrn         v24.4h,  v16.4s,  #1
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x2]
+        st1             {v31.4s}, [x2], x11
+.endr
+
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v27.16b
+
+        blr             x5
+
+        load_add_store_8x8 x0, x7
+        br              x15
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1
+.endif
+        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon)
+        mov             w13, #\eob_half
+        adr             x4,  inv_\txfm1\()_4s_x8_neon
+        b               inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon
+        movi            v28.4s,  #0
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x4
+
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        sqxtn           v20.4h,  v20.4s
+        sqxtn           v21.4h,  v21.4s
+        sqxtn           v22.4h,  v22.4s
+        sqxtn           v23.4h,  v23.4s
+
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+        ins             v16.d[1], v20.d[0]
+        ins             v17.d[1], v21.d[0]
+        ins             v18.d[1], v22.d[0]
+        ins             v19.d[1], v23.d[0]
+
+        blr             x5
+
+        load_add_store_8x4 x0, x7
+        br              x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+        movz            w16, #2896*8, lsl #16
+        movi            v31.4s,  #0
+        dup             v30.2s,  w16
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x6]
+        st1             {v31.4s}, [x6], x11
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19
+        blr             x4
+        sqxtn           v20.4h,  v16.4s
+        sqxtn           v21.4h,  v17.4s
+        sqxtn           v22.4h,  v18.4s
+        sqxtn           v23.4h,  v19.4s
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+
+1:
+.irp i, v20, v21, v22, v23
+        movi            \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x2]
+        st1             {v31.4s}, [x2], x11
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19
+        blr             x4
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5
+
+        load_add_store_4x8 x0, x7
+        br              x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+        mov             w13, #\eob_half
+.endif
+        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
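+// Inverse 16-point DCT over 4 lanes: idct_8 handles the even-indexed
+// registers, the odd half (t8a-t15a) is built from the remaining
+// idct_coeffs, and the two halves are merged at the end.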
+function inv_dct_4s_x16_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        idct_8          v16, v18, v20, v22, v24, v26, v28, v30
+
+        ld1             {v0.4s, v1.4s}, [x16]
+        sub             x16, x16, #32
+
+        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
+        mul_mla         v4,  v17, v31, v0.s[1], v0.s[0] // -> t15a
+        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
+        srshr           v17.4s, v2.4s,  #12             // t8a
+        srshr           v31.4s, v4.4s,  #12             // t15a
+        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
+        mul_mls         v4,  v21, v27, v1.s[0], v1.s[1] // -> t10a
+        srshr           v23.4s, v6.4s,  #12             // t9a
+        srshr           v25.4s, v2.4s,  #12             // t14a
+        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
+        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
+        srshr           v21.4s, v4.4s,  #12             // t10a
+        srshr           v27.4s, v6.4s,  #12             // t13a
+        mul_mla         v4,  v29, v19, v1.s[3], v1.s[2] // -> t12a
+        srshr           v19.4s, v2.4s,  #12             // t11a
+        srshr           v29.4s, v4.4s,  #12             // t12a
+
+        ld1             {v0.4s}, [x16]
+
+        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
+        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
+        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
+        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
+        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
+        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
+
+        mul_mls         v4,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
+        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
+        srshr           v21.4s, v4.4s,  #12             // t9a
+        srshr           v27.4s, v6.4s,  #12             // t14a
+
+        mul_mls         v4,  v29, v23, v0.s[2], v0.s[3] // -> t13a
+        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
+        srshr           v29.4s, v4.4s,  #12             // t13a
+        neg             v6.4s,   v6.4s
+        srshr           v23.4s, v6.4s,  #12             // t10a
+
+        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
+        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
+        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
+        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
+        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
+        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
+        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+        srshr           v4.4s,  v4.4s,  #12   // t11
+        srshr           v5.4s,  v6.4s,  #12   // t12
+        mul_mla         v6,  v25, v21, v0.s[0], v0.s[0] // -> t13a
+        srshr           v2.4s,  v2.4s,  #12   // t10a
+        srshr           v3.4s,  v6.4s,  #12   // t13a
+
+        sqadd           v6.4s,   v16.4s,  v31.4s  // out0
+        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
+        mov             v16.16b, v6.16b
+        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
+        sqsub           v7.4s,   v30.4s,  v17.4s  // out8
+        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
+        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
+        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
+        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
+        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
+        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
+        sqadd           v19.4s,  v22.4s,  v5.4s   // out3
+        sqsub           v28.4s,  v22.4s,  v5.4s   // out12
+        sqadd           v20.4s,  v24.4s,  v4.4s   // out4
+        sqsub           v27.4s,  v24.4s,  v4.4s   // out11
+        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
+        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
+        mov             v24.16b, v7.16b
+        mov             v22.16b, v3.16b
+
+        ret
+endfunc
+
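+// Inverse 16-point ADST; the .ifc blocks below pick between the adst
+// and flipadst output orderings without an extra register shuffle.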
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+        movrel          x16, iadst16_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
+        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
+        srshr           v16.4s, v2.4s,  #12             // t0
+        srshr           v31.4s, v4.4s,  #12             // t1
+        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
+        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
+        srshr           v18.4s, v6.4s,  #12             // t2
+        srshr           v29.4s, v2.4s,  #12             // t3
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
+        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
+        srshr           v20.4s, v4.4s,  #12             // t4
+        srshr           v27.4s, v6.4s,  #12             // t5
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
+        ld1             {v0.4s, v1.4s}, [x16]
+        movrel          x16, idct_coeffs
+        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
+        srshr           v22.4s, v2.4s,  #12             // t6
+        srshr           v25.4s, v4.4s,  #12             // t7
+        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
+        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
+        srshr           v23.4s, v6.4s,  #12             // t8
+        srshr           v24.4s, v2.4s,  #12             // t9
+        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
+        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
+        srshr           v21.4s, v4.4s,  #12             // t10
+        srshr           v26.4s, v6.4s,  #12             // t11
+        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
+        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
+        srshr           v19.4s, v2.4s,  #12             // t12
+        srshr           v28.4s, v4.4s,  #12             // t13
+        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
+        srshr           v17.4s, v6.4s,  #12             // t14
+        srshr           v30.4s, v2.4s,  #12             // t15
+
+        ld1             {v0.4s, v1.4s}, [x16]
+
+        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
+        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
+        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
+        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
+        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
+        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
+        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
+        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
+        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
+        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
+        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
+        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
+        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
+        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
+        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
+        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
+        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
+        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
+        srshr           v17.4s, v4.4s,  #12             // t8
+        srshr           v30.4s, v6.4s,  #12             // t9
+        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
+        srshr           v18.4s, v2.4s,  #12             // t10
+        srshr           v29.4s, v4.4s,  #12             // t11
+        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
+        srshr           v27.4s, v6.4s,  #12             // t12
+        srshr           v20.4s, v2.4s,  #12             // t13
+        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
+        srshr           v25.4s, v4.4s,  #12             // t14
+        srshr           v22.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t4
+        sqadd           v16.4s,  v16.4s,  v21.4s // t0
+        sqsub           v3.4s,   v31.4s,  v26.4s // t5
+        sqadd           v31.4s,  v31.4s,  v26.4s // t1
+        sqadd           v21.4s,  v23.4s,  v19.4s // t2
+        sqsub           v23.4s,  v23.4s,  v19.4s // t6
+        sqadd           v26.4s,  v24.4s,  v28.4s // t3
+        sqsub           v24.4s,  v24.4s,  v28.4s // t7
+        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
+        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
+        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
+        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
+        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
+        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
+        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
+        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
+        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
+        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
+        srshr           v22.4s, v4.4s,  #12             // t4a
+        srshr           v25.4s, v6.4s,  #12             // t5a
+        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
+        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
+        srshr           v24.4s, v2.4s,  #12             // t6a
+        srshr           v23.4s, v4.4s,  #12             // t7a
+        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
+        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
+        srshr           v17.4s, v6.4s,  #12             // t12
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
+        srshr           v29.4s, v2.4s,  #12             // t13
+        srshr           v30.4s, v4.4s,  #12             // t14
+        srshr           v18.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
+.ifc \o0, v16
+        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+.else
+        sqadd           v4.4s,      v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+        mov             \o0\().16b, v4.16b
+.endif
+        sqneg           \o15\().4s, \o15\().4s      // out15
+
+        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
+        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
+        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
+        sqsub           v26.4s,     v17.4s,  v30.4s // t14a
+        sqneg           \o13\().4s, \o13\().4s      // out13
+
+        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
+        sqsub           v27.4s,     v19.4s,  v27.4s // t10
+        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
+        sqsub           v20.4s,     v28.4s,  v20.4s // t11
+        sqneg           \o1\().4s,  \o1\().4s       // out1
+
+        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
+        sqsub           v22.4s,     v22.4s,  v24.4s // t6
+        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
+        sqsub           v23.4s,     v25.4s,  v23.4s // t7
+        sqneg           \o3\().4s,  \o3\().4s       // out3
+
+        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+        srshr           v24.4s, v24.4s, #12             // out8
+        srshr           v4.4s,  v4.4s,  #12             // out7
+        srshr           v5.4s,  v6.4s,  #12             // out5
+        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+        srshr           v26.4s, v6.4s,  #12             // out10
+
+        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+        srshr           \o4\().4s,   v2.4s,  #12        // out4
+        srshr           v6.4s,       v6.4s,  #12        // out11
+        srshr           v7.4s,       v21.4s, #12        // out9
+        srshr           \o6\().4s,   v22.4s, #12        // out6
+
+.ifc \o8, v23
+        mov             \o8\().16b,  v24.16b
+        mov             \o10\().16b, v26.16b
+.endif
+
+        sqneg           \o7\().4s,   v4.4s // out7
+        sqneg           \o5\().4s,   v5.4s // out5
+        sqneg           \o11\().4s,  v6.4s // out11
+        sqneg           \o9\().4s,   v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon
+        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        ret
+endfunc
+
+function inv_flipadst_4s_x16_neon
+        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
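+// 16-point identity transform: scale v16-v31 by 5793/2048
+// (~2*sqrt(2)), computed as 2*x plus the sqrdmulh of the fractional
+// part, since the full factor exceeds sqrdmulh's range.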
+function inv_identity_4s_x16_neon
+        movz            w16, #2*(5793-4096)*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
+        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
+        sqadd           v\i\().4s,  v\i\().4s,  v2.4s
+.endr
+        ret
+endfunc
+
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c
+        srshr           v3.4s,   v3.4s,   #1
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c
+        sqadd           \i,      \i,      \i
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
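+// First-pass helper: one horizontal 16-point transform across 4 rows,
+// zeroing the coefficient buffer as it is consumed. The _scale
+// variant pre-multiplies the input by 2896/4096 (~1/sqrt(2)) and uses
+// a one-bit smaller narrowing shift, as the rectangular sizes need.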
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+        mov             x14, x30
+        movi            v7.4s,  #0
+.if \scale
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        blr             x4
+        sqrshrn         v16.4h,  v16.4s,  #\shift
+        sqrshrn         v17.4h,  v17.4s,  #\shift
+        sqrshrn         v18.4h,  v18.4s,  #\shift
+        sqrshrn         v19.4h,  v19.4s,  #\shift
+        sqrshrn2        v16.8h,  v20.4s,  #\shift
+        sqrshrn2        v17.8h,  v21.4s,  #\shift
+        sqrshrn2        v18.8h,  v22.4s,  #\shift
+        sqrshrn2        v19.8h,  v23.4s,  #\shift
+        sqrshrn         v20.4h,  v24.4s,  #\shift
+        sqrshrn         v21.4h,  v25.4s,  #\shift
+        sqrshrn         v22.4h,  v26.4s,  #\shift
+        sqrshrn         v23.4h,  v27.4s,  #\shift
+        sqrshrn2        v20.8h,  v28.4s,  #\shift
+        sqrshrn2        v21.8h,  v29.4s,  #\shift
+        sqrshrn2        v22.8h,  v30.4s,  #\shift
+        sqrshrn2        v23.8h,  v31.4s,  #\shift
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+        st1             {\i}, [x6], #16
+.endr
+
+        br              x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
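+// Second-pass helper: load an 8x16 slice of first-pass output from
+// x7, run the vertical transform via x5 and add/clip into the frame
+// at x6.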
+function inv_txfm_add_vert_8x16_neon
+        mov             x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        blr             x5
+        load_add_store_8x16 x6, x7
+        br              x14
+endfunc
+
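+// 16x16 driver: four horizontal 16x4 passes into a 512-byte scratch
+// buffer on the stack, with eob thresholds read from x13 so 4-row
+// slices past the eob are zero-filled instead of transformed, then
+// two vertical 8x16 passes add the result to the frame.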
+function inv_txfm_add_16x16_neon
+        mov             x15, x30
+        sub             sp,  sp,  #512
+        ldrh            w12, [x13], #2
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 12
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
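+// eob thresholds for the 16x16 driver: once the eob is below the next
+// entry, the remaining 4-row slices hold no coefficients and are
+// skipped. The identity variants use different cutoffs since their
+// scan order differs.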
+const eob_16x16
+        .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+        .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x16_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16
+.else
+        movrel          x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16_identity
+.else
+        movrel          x13, eob_16x16
+.endif
+.endif
+        b               inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
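+// 16x4: one 16-point first pass, then the 8-column-wide 4-point
+// second pass runs twice, once per half of the transposed output.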
+function inv_txfm_add_16x4_neon
+        mov             x15, x30
+        movi            v4.4s,  #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], #16
+.endr
+
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        mov             x6,  x0
+        load_add_store_8x4 x6, x7
+
+        sqrshrn         v16.4h,  v24.4s,  #1
+        sqrshrn         v17.4h,  v25.4s,  #1
+        sqrshrn         v18.4h,  v26.4s,  #1
+        sqrshrn         v19.4h,  v27.4s,  #1
+        sqrshrn2        v16.8h,  v28.4s,  #1
+        sqrshrn2        v17.8h,  v29.4s,  #1
+        sqrshrn2        v18.8h,  v30.4s,  #1
+        sqrshrn2        v19.8h,  v31.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        add             x6,  x0,  #16
+        load_add_store_8x4 x6, x7
+
+        br              x15
+endfunc
+
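+// 4x16: the 4-point first pass runs up to four times on 4-row chunks,
+// with the eob thresholds in x13 deciding per chunk whether it can be
+// skipped and zero-filled instead.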
+function inv_txfm_add_4x16_neon
+        ldrh            w12, [x13, #4]
+        mov             x15, x30
+
+        mov             x11, #64
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2]
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v28.4h,  v16.4s,  #1
+        rshrn           v29.4h,  v17.4s,  #1
+        rshrn           v30.4h,  v18.4s,  #1
+        rshrn           v31.4h,  v19.4s,  #1
+        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0]
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v24.4h,  v16.4s,  #1
+        rshrn           v25.4h,  v17.4s,  #1
+        rshrn           v26.4h,  v18.4s,  #1
+        rshrn           v27.4h,  v19.4s,  #1
+        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v20.4h,  v16.4s,  #1
+        rshrn           v21.4h,  v17.4s,  #1
+        rshrn           v22.4h,  v18.4s,  #1
+        rshrn           v23.4h,  v19.4s,  #1
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+        movi            \i,  #0
+.endr
+2:
+
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x2]
+        st1             {v2.4s}, [x2], x11
+.endr
+        blr             x4
+        rshrn           v16.4h,  v16.4s,  #1
+        rshrn           v17.4h,  v17.4s,  #1
+        rshrn           v18.4h,  v18.4s,  #1
+        rshrn           v19.4h,  v19.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5
+
+        load_add_store_4x16 x0, x6
+
+        br              x15
+endfunc
+
+const eob_4x16
+        .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+        .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+        .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+.if \w == 4
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16
+.else
+        movrel          x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16_identity2
+.else
+        movrel          x13, eob_4x16
+.endif
+.endif
+.else
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
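+// 16x8: both first-pass halves are pre-scaled by 2896/4096
+// (~1/sqrt(2)) as with the other 2:1 rectangular sizes; d8-d15 are
+// saved because the first half's results stay parked in v8-v15 across
+// the later transform calls.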
+function inv_txfm_add_16x8_neon
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
+        movi            v4.4s,  #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        sqrshrn         v12.4h,  v24.4s,  #1
+        sqrshrn         v13.4h,  v25.4s,  #1
+        sqrshrn         v14.4h,  v26.4s,  #1
+        sqrshrn         v15.4h,  v27.4s,  #1
+        sqrshrn2        v12.8h,  v28.4s,  #1
+        sqrshrn2        v13.8h,  v29.4s,  #1
+        sqrshrn2        v14.8h,  v30.4s,  #1
+        sqrshrn2        v15.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
+        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5
+
+        b               2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+        movi            \i,  #0
+.endr
+2:
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+
+        movi            v4.4s,  #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        mov             v20.16b, v8.16b
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        sqrshrn         v8.4h,   v24.4s,  #1
+        sqrshrn         v9.4h,   v25.4s,  #1
+        sqrshrn         v10.4h,  v26.4s,  #1
+        sqrshrn         v11.4h,  v27.4s,  #1
+        sqrshrn2        v8.8h,   v28.4s,  #1
+        sqrshrn2        v9.8h,   v29.4s,  #1
+        sqrshrn2        v10.8h,  v30.4s,  #1
+        sqrshrn2        v11.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5
+
+        blr             x5
+
+        mov             x6,  x0
+        load_add_store_8x8 x6, x7
+
+        mov             v16.16b, v8.16b
+        mov             v17.16b, v9.16b
+        mov             v18.16b, v10.16b
+        mov             v19.16b, v11.16b
+        mov             v20.16b, v12.16b
+        mov             v21.16b, v13.16b
+        mov             v22.16b, v14.16b
+        mov             v23.16b, v15.16b
+
+        blr             x5
+
+        add             x0,  x0,  #16
+        load_add_store_8x8 x0, x7
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+        br              x15
+endfunc
+
+function inv_txfm_add_8x16_neon
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x20]!
+        stp             d10, d11, [sp, #0x10]
+        ldrh            w12, [x13, #4]
+
+        mov             x11, #64
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2]
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v28.4h,  v16.4s,  #1
+        sqrshrn         v29.4h,  v17.4s,  #1
+        sqrshrn         v30.4h,  v18.4s,  #1
+        sqrshrn         v31.4h,  v19.4s,  #1
+        sqrshrn2        v28.8h,  v20.4s,  #1
+        sqrshrn2        v29.8h,  v21.4s,  #1
+        sqrshrn2        v30.8h,  v22.4s,  #1
+        sqrshrn2        v31.8h,  v23.4s,  #1
+        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0]
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v24.4h,  v16.4s,  #1
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+        movi            \i,  #0
+.endr
+
+2:
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5
+
+        mov             v20.16b, v8.16b
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        blr             x5
+
+        load_add_store_8x16 x0, x6
+
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x20
+
+        br              x15
+endfunc
+
+const eob_8x16
+        .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+        .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+        .short 4, 8, 12, 128
+endconst
+
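+// For the 8-tall variants only the first threshold matters, so it is
+// loaded into w13 directly instead of leaving the table pointer in
+// x13.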
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16
+.else
+        movrel          x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16_identity2
+.else
+        movrel          x13, eob_8x16
+.endif
+.endif
+.if \h == 8
+        ldrh            w13, [x13]
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
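+// Odd half of the inverse 32-point DCT: computes out16-out31 from the
+// odd-indexed inputs in v16-v31, using the idct_coeffs entries that
+// start at byte offset 4*16.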
+function inv_dct32_odd_4s_x16_neon
+        movrel          x16, idct_coeffs, 4*16
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
+        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
+        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
+        srshr           v16.4s, v2.4s,  #12             // t16a
+        srshr           v31.4s, v4.4s,  #12             // t31a
+        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
+        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
+        srshr           v24.4s, v6.4s,  #12             // t17a
+        srshr           v23.4s, v2.4s,  #12             // t30a
+        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
+        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
+        srshr           v20.4s, v4.4s,  #12             // t18a
+        srshr           v27.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
+        ld1             {v0.4s, v1.4s}, [x16]
+        sub             x16, x16, #4*24
+        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
+        srshr           v28.4s, v2.4s,  #12             // t19a
+        srshr           v19.4s, v4.4s,  #12             // t28a
+        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
+        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
+        srshr           v18.4s, v6.4s,  #12             // t20a
+        srshr           v29.4s, v2.4s,  #12             // t27a
+        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
+        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
+        srshr           v26.4s, v4.4s,  #12             // t21a
+        srshr           v21.4s, v6.4s,  #12             // t26a
+        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
+        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
+        srshr           v22.4s, v2.4s,  #12             // t22a
+        srshr           v25.4s, v4.4s,  #12             // t25a
+        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
+        srshr           v30.4s, v6.4s,  #12             // t23a
+        srshr           v17.4s, v2.4s,  #12             // t24a
+
+        ld1             {v0.4s, v1.4s}, [x16]
+
+        sqsub           v2.4s,   v16.4s,  v24.4s // t17
+        sqadd           v16.4s,  v16.4s,  v24.4s // t16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t30
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31
+        sqsub           v24.4s,  v28.4s,  v20.4s // t18
+        sqadd           v28.4s,  v28.4s,  v20.4s // t19
+        sqadd           v23.4s,  v18.4s,  v26.4s // t20
+        sqsub           v18.4s,  v18.4s,  v26.4s // t21
+        sqsub           v20.4s,  v30.4s,  v22.4s // t22
+        sqadd           v30.4s,  v30.4s,  v22.4s // t23
+        sqadd           v26.4s,  v17.4s,  v25.4s // t24
+        sqsub           v17.4s,  v17.4s,  v25.4s // t25
+        sqsub           v22.4s,  v29.4s,  v21.4s // t26
+        sqadd           v29.4s,  v29.4s,  v21.4s // t27
+        sqadd           v25.4s,  v19.4s,  v27.4s // t28
+        sqsub           v19.4s,  v19.4s,  v27.4s // t29
+
+        mul_mls         v4,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
+        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
+        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
+        srshr           v21.4s, v4.4s,  #12             // t17a
+        srshr           v27.4s, v6.4s,  #12             // t30a
+        neg             v2.4s,   v2.4s                  // -> t18a
+        mul_mls         v4,  v19, v24, v1.s[0], v1.s[1] // -> t29a
+        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
+        srshr           v19.4s, v2.4s,  #12             // t18a
+        srshr           v24.4s, v4.4s,  #12             // t29a
+        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
+        mul_mla         v4,  v17, v20, v1.s[3], v1.s[2] // -> t22a
+        srshr           v22.4s, v6.4s,  #12             // t21a
+        srshr           v18.4s, v2.4s,  #12             // t26a
+        neg             v4.4s,   v4.4s                  // -> t22a
+        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
+        srshr           v17.4s, v4.4s,  #12             // t22a
+        srshr           v20.4s, v6.4s,  #12             // t25a
+
+        sqsub           v2.4s,   v27.4s,  v24.4s // t29
+        sqadd           v27.4s,  v27.4s,  v24.4s // t30
+        sqsub           v3.4s,   v21.4s,  v19.4s // t18
+        sqadd           v21.4s,  v21.4s,  v19.4s // t17
+        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
+        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
+        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
+        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
+        sqsub           v28.4s,  v17.4s,  v22.4s // t21
+        sqadd           v17.4s,  v17.4s,  v22.4s // t22
+        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
+        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
+        sqadd           v22.4s,  v20.4s,  v18.4s // t25
+        sqsub           v20.4s,  v20.4s,  v18.4s // t26
+        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
+        sqadd           v31.4s,  v31.4s,  v25.4s // t31a
+
+        mul_mls         v4,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
+        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
+        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
+        srshr           v18.4s, v4.4s,  #12             // t18a
+        srshr           v25.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v29, v24, v0.s[3], v0.s[2] // -> t28
+        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
+        srshr           v29.4s, v2.4s,  #12             // t19
+        srshr           v24.4s, v4.4s,  #12             // t28
+        neg             v6.4s,   v6.4s                  // -> t20
+        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
+        mul_mla         v4,  v20, v28, v0.s[3], v0.s[2] // -> t21a
+        srshr           v26.4s, v6.4s,  #12             // t20
+        srshr           v19.4s, v2.4s,  #12             // t27
+        neg             v4.4s,   v4.4s                  // -> t21a
+        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
+        srshr           v20.4s, v4.4s,  #12             // t21a
+        srshr           v28.4s, v6.4s,  #12             // t26a
+
+        sqsub           v2.4s,   v16.4s,  v30.4s // t23
+        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t24
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
+        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
+        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
+        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
+        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
+        sqsub           v27.4s,  v18.4s,  v20.4s // t21
+        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
+        sqadd           v4.4s,   v29.4s,  v26.4s // t19a = out19
+        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
+        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
+        sqsub           v25.4s,  v25.4s,  v28.4s // t26
+        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
+        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
+        mov             v19.16b, v4.16b          // out19
+
+        mul_mls         v4,  v24, v26, v0.s[0], v0.s[0] // -> t20
+        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
+        srshr           v20.4s, v4.4s,  #12             // t20
+        srshr           v22.4s, v6.4s,  #12             // t27
+
+        mul_mla         v4,  v25, v27, v0.s[0], v0.s[0] // -> t26a
+        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
+        mov             v27.16b,  v22.16b               // t27
+        srshr           v26.4s, v4.4s,  #12             // t26a
+
+        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+        mul_mla         v4,  v21, v23, v0.s[0], v0.s[0] // -> t25
+        srshr           v21.4s, v6.4s,  #12             // t21a
+        srshr           v22.4s, v24.4s, #12             // t22
+        srshr           v25.4s, v4.4s,  #12             // t25
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
+        srshr           v23.4s, v4.4s,  #12             // t23a
+        srshr           v24.4s, v6.4s,  #12             // t24a
+
+        ret
+endfunc
+
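+// Horizontal pass of the 32-point inverse DCT on a 32x4 strip: the
+// even-indexed inputs go through inv_dct_4s_x16_neon and the odd-indexed
+// ones through inv_dct32_odd_4s_x16_neon, then store2 applies the final
+// butterfly out[i] = even[i] + odd[i], out[31-i] = even[i] - odd[i]
+// while narrowing to 16 bit.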
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+        mov             x14, x30
+        movi            v7.4s,  #0
+        lsl             x8,  x8,  #1
+.if \scale
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct_4s_x16_neon
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
+        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5
+
+.macro store1 r0, r1, r2, r3
+        st1             {\r0}, [x6], #16
+        st1             {\r1}, [x6], #16
+        st1             {\r2}, [x6], #16
+        st1             {\r3}, [x6], #16
+.endm
+        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s
+        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
+        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
+        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
+.purgem store1
+        sub             x6,  x6,  #64*4
+
+        movi            v7.4s,  #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct32_odd_4s_x16_neon
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
+        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
+.macro store2 r0, r1, r2, r3, shift
+        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+        sqsub           v4.4s,   v0.4s,   \r0
+        sqadd           v0.4s,   v0.4s,   \r0
+        sqsub           v5.4s,   v1.4s,   \r1
+        sqadd           v1.4s,   v1.4s,   \r1
+        sqsub           v6.4s,   v2.4s,   \r2
+        sqadd           v2.4s,   v2.4s,   \r2
+        sqsub           v7.4s,   v3.4s,   \r3
+        sqadd           v3.4s,   v3.4s,   \r3
+        sqrshrn         v0.4h,   v0.4s,   #\shift
+        sqrshrn2        v0.8h,   v1.4s,   #\shift
+        sqrshrn         v1.4h,   v2.4s,   #\shift
+        sqrshrn2        v1.8h,   v3.4s,   #\shift
+        sqrshrn         v2.4h,   v7.4s,   #\shift
+        sqrshrn2        v2.8h,   v6.4s,   #\shift
+        sqrshrn         v3.4h,   v5.4s,   #\shift
+        sqrshrn2        v3.8h,   v4.4s,   #\shift
+        st1             {v0.8h, v1.8h}, [x6], #32
+        rev64           v2.8h,   v2.8h
+        rev64           v3.8h,   v3.8h
+        st1             {v2.8h, v3.8h}, [x6], #32
+.endm
+
+        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
+        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
+        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
+        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
+.purgem store2
+        br              x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
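+// Vertical pass of the 32-point inverse DCT on an 8-pixel-wide column:
+// the even/odd 16-point halves run in-place on the scratch buffer, then
+// `combine` rounds the butterflied results, adds them to the destination
+// and clamps to the 10 bpc range [0, 0x3ff].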
+function inv_txfm_add_vert_dct_8x32_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+
+        bl              X(inv_dct_8h_x16_neon)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        st1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        sub             x7,  x7,  x8, lsr #1
+        bl              X(inv_dct32_odd_8h_x16_neon)
+
+        neg             x9,  x8
+        mov             x10, x6
+        movi            v0.8h,   #0
+        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff (10 bpc pixel max)
+.macro combine r0, r1, r2, r3, op, stride
+        ld1             {v5.8h}, [x7],    \stride
+        ld1             {v2.8h}, [x10],   x1
+        ld1             {v6.8h}, [x7],    \stride
+        ld1             {v3.8h}, [x10],   x1
+        \op             v5.8h,   v5.8h,   \r0
+        ld1             {v7.8h}, [x7],    \stride
+        ld1             {v4.8h}, [x10],   x1
+        srshr           v5.8h,   v5.8h,   #4
+        \op             v6.8h,   v6.8h,   \r1
+        sqadd           v5.8h,   v5.8h,   v2.8h
+        srshr           v6.8h,   v6.8h,   #4
+        \op             v7.8h,   v7.8h,   \r2
+        smax            v2.8h,   v5.8h,   v0.8h
+        ld1             {v5.8h}, [x7],    \stride
+        sqadd           v6.8h,   v6.8h,   v3.8h
+        smin            v2.8h,   v2.8h,   v1.8h
+        srshr           v7.8h,   v7.8h,   #4
+        \op             v5.8h,   v5.8h,   \r3
+        st1             {v2.8h}, [x6],    x1
+        ld1             {v2.8h}, [x10],   x1
+        smax            v3.8h,   v6.8h,   v0.8h
+        sqadd           v7.8h,   v7.8h,   v4.8h
+        smin            v3.8h,   v3.8h,   v1.8h
+        srshr           v5.8h,   v5.8h,   #4
+        st1             {v3.8h}, [x6],    x1
+        smax            v4.8h,   v7.8h,   v0.8h
+        sqadd           v5.8h,   v5.8h,   v2.8h
+        smin            v4.8h,   v4.8h,   v1.8h
+        st1             {v4.8h}, [x6],    x1
+        smax            v2.8h,   v5.8h,   v0.8h
+        smin            v2.8h,   v2.8h,   v1.8h
+        st1             {v2.8h}, [x6],    x1
+.endm
+        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+        sub             x7,  x7,  x8
+        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+        br              x14
+endfunc
+
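+// End-of-block thresholds: w3 holds the scan index of the last nonzero
+// coefficient, and whenever it falls below the next table entry, all
+// remaining strips are known to be zero and are skipped or zero-filled.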
+const eob_32x32
+        .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+        .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+        .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+        .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
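+// 32x32 identity transform: the coefficients pass through unchanged, so
+// each 8x8 block is just narrowed to 16 bit, transposed and added to the
+// destination, with the eob tables bounding how many blocks are visited.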
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+        movi            v0.8h,  #0
+        movi            v1.8h,  #0
+        movrel          x13, eob_32x32, 2
+
+        mov             x8,  #4*32
+1:
+        mov             w9,  #0
+        movrel          x12, eob_32x32, 2
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        load_add_store_8x8 x0, x7, shiftbits=2
+        ldrh            w11, [x12], #4
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #2*8
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #4
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw #1
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #4*8
+        b               1b
+9:
+        ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        \op             \i,  \i,  #\shift
+.endr
+.endm
+
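+// Rectangular 16x32 and 32x16 identity transforms. 2896/4096 is 1/sqrt(2)
+// in .12 fixed point (the rectangular rescale), and 2*(5793-4096)/4096 is
+// 2*(sqrt(2)-1): added to the doubled input, it realizes the 2*sqrt(2)
+// scaling of the 16-point identity transform.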
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+        movz            w16, #2896*8, lsl #16
+        movz            w17, #2*(5793-4096)*8, lsl #16
+        movi            v0.4s,   #0
+        movi            v1.4s,   #0
+        movrel          x13, eob_16x32\hshort, 2
+
+        mov             x8,  #4*\h
+1:
+        mov             w9,  #0
+        movrel          x12, eob_16x32\wshort, 2
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        dup             v2.2s,   w16
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        mov             v2.s[1], w17
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+        // 16x32
+        identity_4x16_shift1 v2.s[1]
+.else
+        // 32x16
+        shift_16_regs   sqshl, 1
+        identity_4x16   v2.s[1]
+.endif
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+        ldrh            w11, [x12], #4
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #16
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #4
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw #1
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #4*8
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
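+// 8x32 and 32x8 identity transforms: no arithmetic beyond rescaling; the
+// two variants differ only in where the rounding shifts are applied.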
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+        movi            v0.4s,  #0
+        movi            v1.4s,  #0
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        movrel          x13, eob_8x32, 2
+
+        mov             w8,  #4*\h
+1:
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        ldrh            w12, [x13], #4
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+
+.if \w == 8
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn2        v16.8h,  v17.4s,  #1
+        sqrshrn         v17.4h,  v18.4s,  #1
+        sqrshrn2        v17.8h,  v19.4s,  #1
+        sqrshrn         v18.4h,  v20.4s,  #1
+        sqrshrn2        v18.8h,  v21.4s,  #1
+        sqrshrn         v19.4h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        sqrshrn         v20.4h,  v24.4s,  #1
+        sqrshrn2        v20.8h,  v25.4s,  #1
+        sqrshrn         v21.4h,  v26.4s,  #1
+        sqrshrn2        v21.8h,  v27.4s,  #1
+        sqrshrn         v22.4h,  v28.4s,  #1
+        sqrshrn2        v22.8h,  v29.4s,  #1
+        sqrshrn         v23.4h,  v30.4s,  #1
+        sqrshrn2        v23.8h,  v31.4s,  #1
+.else
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        cmp             w3,  w12
+.if \w == 8
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+        b.lt            9f
+.if \w == 8
+        sub             x2,  x2,  x8, lsl #3
+        add             x2,  x2,  #4*8
+.else
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #2*8
+.endif
+        b               1b
+
+9:
+        ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
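+// The dct_dct entry points below share one structure: an idct_dc fast path
+// for DC-only blocks, a horizontal pass per 4-row strip into a scratch
+// buffer on the stack (strips past the eob threshold are zero-filled
+// instead), and vertical passes that add the result to the destination.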
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+        idct_dc         32,  32,  2
+
+        mov             x15, x30
+        sub             sp,  sp,  #2048
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #2048
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+        idct_dc         16,  32,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+        adr             x4,  inv_dct_4s_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*16*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        mov             x8,  #4*32
+        bl              inv_txfm_horz_scale_16x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #16*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+        idct_dc         32,  16,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+
+        movrel          x13, eob_16x32
+        movrel          x5,  X(inv_dct_8h_x16_neon)
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*32*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        mov             x8,  #4*16
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
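+// 8x32: the horizontal pass runs the plain 8-point DCT on 4-row strips,
+// narrowing with a #2 rounding shift, and a single 8-wide vertical
+// 32-point pass adds the result to the destination.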
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+        idct_dc         8,   32, 2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+        movrel          x13, eob_8x32
+
+        movi            v28.4s,  #0
+        mov             x8,  #4*32
+        mov             w9,  #32
+        mov             x6,  sp
+        mov             x7,  x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().4s}, [x7]
+        st1             {v28.4s}, [x7], x8
+.endr
+        ldrh            w12, [x13], #2
+        sub             w9,  w9,  #4
+        sub             x7,  x7,  x8, lsl #3
+        add             x7,  x7,  #4*4
+
+        bl              inv_dct_4s_x8_neon
+
+        sqrshrn         v16.4h,  v16.4s,  #2
+        sqrshrn         v17.4h,  v17.4s,  #2
+        sqrshrn         v18.4h,  v18.4s,  #2
+        sqrshrn         v19.4h,  v19.4s,  #2
+        sqrshrn2        v16.8h,  v20.4s,  #2
+        sqrshrn2        v17.8h,  v21.4s,  #2
+        sqrshrn2        v18.8h,  v22.4s,  #2
+        sqrshrn2        v19.8h,  v23.4s,  #2
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        cmp             w3,  w12
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+        b.ge            1b
+        cbz             w9,  3f
+
+        movi            v29.8h,  #0
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+2:
+        subs            w9,  w9,  #4
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+        b.gt            2b
+
+3:
+        mov             x6,  x0
+        mov             x7,  sp
+        mov             x8,  #8*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+        idct_dc         32,  8,   2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+.irp i, 0, 4
+        add             x6,  sp,  #(\i*32*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        cmp             w3,  #10
+        b.lt            1f
+.endif
+        mov             x8,  #8*4
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               2f
+
+1:
+        movi            v4.8h,   #0
+        movi            v5.8h,   #0
+        movi            v6.8h,   #0
+        movi            v7.8h,   #0
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+        mov             x8,  #2*32
+        mov             w9,  #0
+1:
+        add             x6,  x0,  x9, lsl #1
+        add             x7,  sp,  x9, lsl #1 // i.e. #(\i*2) with \i in w9
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        add             w9,  w9,  #8
+
+        bl              X(inv_dct_8h_x8_neon)
+
+        cmp             w9,  #32
+
+        load_add_store_8x8 x6, x7
+
+        b.lt            1b
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
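+// Building blocks for the 64-point inverse DCT. step1 computes the first
+// butterfly stages of the odd half (t32-t63) from four odd-indexed inputs
+// at a time, using coefficients loaded from idct64_coeffs via x17; step2
+// below finishes those butterflies across the stored intermediates.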
+function inv_dct64_step1_neon
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+        ld1             {v0.4s, v1.4s}, [x17], #32
+
+        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
+        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
+        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
+        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
+        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
+        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
+        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
+        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a
+
+        ld1             {v0.4s}, [x17], #16
+
+        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
+        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
+        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
+        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
+        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
+        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
+        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
+        sqadd           v31.4s,  v23.4s,  v22.4s        // t63
+
+        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
+        mul_mls         v4,  v29, v26, v0.s[1], v0.s[0] // -> t61a
+        neg             v2.4s,   v2.4s                  // t34a
+        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
+        srshr           v26.4s, v2.4s,  #12             // t34a
+        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
+        srshr           v29.4s, v4.4s,  #12             // t61a
+        srshr           v25.4s, v6.4s,  #12             // t33a
+        srshr           v30.4s, v2.4s,  #12             // t62a
+
+        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
+        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
+        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
+        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
+        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
+        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
+        sqadd           v22.4s,  v30.4s,  v29.4s        // t62
+
+        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
+        mul_mls         v4,  v21, v18, v0.s[3], v0.s[2] // -> t34a
+        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
+        srshr           v21.4s, v2.4s,  #12             // t61a
+        srshr           v18.4s, v4.4s,  #12             // t34a
+        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
+        srshr           v20.4s, v6.4s,  #12             // t60
+        srshr           v19.4s, v2.4s,  #12             // t35
+
+        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+        ret
+endfunc
+
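+// Second half of the t32-t63 butterflies: walks the values stored by
+// step1 from both ends (x6 ascending, x9 descending) until the pointers
+// meet, rotating each group with the coefficients from idct_coeffs.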
+function inv_dct64_step2_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16]
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        ldr             q16, [x6, #4*4*0]  // t32a
+        ldr             q17, [x9, #4*4*8]  // t39a
+        ldr             q18, [x9, #4*4*0]  // t63a
+        ldr             q19, [x6, #4*4*8]  // t56a
+        ldr             q20, [x6, #4*4*16] // t40a
+        ldr             q21, [x9, #4*4*24] // t47a
+        ldr             q22, [x9, #4*4*16] // t55a
+        ldr             q23, [x6, #4*4*24] // t48a
+
+        sqadd           v24.4s,  v16.4s, v17.4s         // t32
+        sqsub           v25.4s,  v16.4s, v17.4s         // t39
+        sqadd           v26.4s,  v18.4s, v19.4s         // t63
+        sqsub           v27.4s,  v18.4s, v19.4s         // t56
+        sqsub           v28.4s,  v21.4s, v20.4s         // t40
+        sqadd           v29.4s,  v21.4s, v20.4s         // t47
+        sqadd           v30.4s,  v23.4s, v22.4s         // t48
+        sqsub           v31.4s,  v23.4s, v22.4s         // t55
+
+        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
+        mul_mls         v4,  v27, v25, v0.s[2], v0.s[3] // -> t39a
+        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
+        srshr           v25.4s, v2.4s,  #12             // t56a
+        srshr           v27.4s, v4.4s,  #12             // t39a
+        neg             v6.4s,   v6.4s                  // t40a
+        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
+        srshr           v31.4s, v6.4s,  #12             // t40a
+        srshr           v28.4s, v2.4s,  #12             // t55a
+
+        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
+        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
+        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
+        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
+        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
+        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
+        sqadd           v22.4s,  v25.4s,  v28.4s        // t56
+
+        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
+        mul_mla         v4,  v21, v18, v0.s[0], v0.s[0] // -> t55a
+        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
+        srshr           v18.4s, v2.4s,  #12             // t40a
+        srshr           v21.4s, v4.4s,  #12             // t55a
+        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
+        srshr           v19.4s, v6.4s,  #12             // t47
+        srshr           v20.4s, v2.4s,  #12             // t48
+
+        str             q16, [x6, #4*4*0]  // t32a
+        str             q17, [x9, #4*4*0]  // t39
+        str             q18, [x6, #4*4*8]  // t40a
+        str             q19, [x9, #4*4*8]  // t47
+        str             q20, [x6, #4*4*16] // t48
+        str             q21, [x9, #4*4*16] // t55a
+        str             q22, [x6, #4*4*24] // t56
+        str             q23, [x9, #4*4*24] // t63a
+
+        add             x6,  x6,  #4*4
+        sub             x9,  x9,  #4*4
+        cmp             x6,  x9
+        b.lt            1b
+        ret
+endfunc
+
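+// Conditional helper macros: each *_if variant only emits its instruction
+// when \cond is set, so a single def_dct64_func body can expand into the
+// plain, coefficient-clearing and input-scaling flavors.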
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+        ld1             {\i}, [\src]
+        st1             {\zero}, [\src], \strd
+.else
+        ld1             {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        st1             {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        movi            \i,  #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+        movi            \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+        movz            \gpr, \val, lsl #16
+        dup             \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+        st1             \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+        str             \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+        str             \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
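+// One 4-column slice of the 64-point inverse DCT. The even half is the
+// 32-point DCT of the even inputs (16-point even part plus dct32_odd),
+// combined in-place via store_addsub; the odd half is produced by four
+// step1 calls on the remaining inputs followed by step2.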
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+        mov             x14, x30
+        mov             x6,  sp
+        lsl             x8,  x8,  #2
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        add             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct_4s_x16_neon
+
+        store16         x6
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        lsr             x8,  x8,  #1
+        sub             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct32_odd_4s_x16_neon
+
+        add             x10, x6,  #16*15
+        sub             x6,  x6,  #16*16
+
+        mov             x9,  #-16
+
+.macro store_addsub r0, r1, r2, r3
+        ld1             {v2.4s}, [x6], #16
+        ld1             {v3.4s}, [x6], #16
+        sqadd           v6.4s,  v2.4s,  \r0
+        sqsub           \r0,    v2.4s,  \r0
+        ld1             {v4.4s}, [x6], #16
+        sqadd           v7.4s,  v3.4s,  \r1
+        sqsub           \r1,    v3.4s,  \r1
+        ld1             {v5.4s}, [x6], #16
+        sqadd           v2.4s,  v4.4s,  \r2
+        sub             x6,  x6,  #16*4
+        sqsub           \r2,    v4.4s,  \r2
+        st1             {v6.4s}, [x6], #16
+        st1             {\r0},   [x10], x9
+        sqadd           v3.4s,  v5.4s,  \r3
+        sqsub           \r3,    v5.4s,  \r3
+        st1             {v7.4s}, [x6], #16
+        st1             {\r1},   [x10], x9
+        st1             {v2.4s}, [x6], #16
+        st1             {\r2},   [x10], x9
+        st1             {v3.4s}, [x6], #16
+        st1             {\r3},   [x10], x9
+.endm
+        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
+        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
+        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
+        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+        add             x6,  x6,  #4*4*16
+
+        movrel          x17, idct64_coeffs
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x9,  x7,  x8, lsl #4 // offset 16
+        add             x10, x7,  x8, lsl #3 // offset 8
+        sub             x9,  x9,  x8         // offset 15
+        sub             x11, x10, x8         // offset 7
+        ld1             {v16.4s}, [x7]  // in1  (offset 0)
+        ld1             {v17.4s}, [x9]  // in31 (offset 15)
+        ld1             {v18.4s}, [x10] // in17 (offset 8)
+        ld1             {v19.4s}, [x11] // in15 (offset 7)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x7,  x7,  x8, lsl #2 // offset 4
+        sub             x9,  x9,  x8, lsl #2 // offset 11
+        sub             x10, x7,  x8         // offset 3
+        add             x11, x9,  x8         // offset 12
+        ld1             {v16.4s}, [x10] // in7  (offset 3)
+        ld1             {v17.4s}, [x11] // in25 (offset 12)
+        ld1             {v18.4s}, [x9]  // in23 (offset 11)
+        ld1             {v19.4s}, [x7]  // in9  (offset 4)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        sub             x10, x10, x8, lsl #1 // offset 1
+        sub             x9,  x9,  x8, lsl #1 // offset 9
+        add             x7,  x7,  x8         // offset 5
+        add             x11, x11, x8         // offset 13
+        ldr             q16, [x10, x8] // in5  (offset 2)
+        ldr             q17, [x11]     // in27 (offset 13)
+        ldr             q18, [x9,  x8] // in21 (offset 10)
+        ldr             q19, [x7]      // in11 (offset 5)
+        stroff_if       q7,  [x10, x8], \clear
+        str_if          q7,  [x11],     \clear
+        stroff_if       q7,  [x9,  x8], \clear
+        str_if          q7,  [x7],      \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        ldr             q16, [x10]     // in3  (offset 1)
+        ldr             q17, [x11, x8] // in29 (offset 14)
+        ldr             q18, [x9]      // in19 (offset 9)
+        ldr             q19, [x7,  x8] // in13 (offset 6)
+        str_if          q7,  [x10],     \clear
+        stroff_if       q7,  [x11, x8], \clear
+        str_if          q7,  [x9],      \clear
+        stroff_if       q7,  [x7,  x8], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+
+        sub             x6,  x6,  #4*4*32
+        add             x9,  x6,  #4*4*7
+
+        bl              inv_dct64_step2_neon
+
+        br              x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
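+// Horizontal 64-point pass: pairs the intermediates from both ends of the
+// scratch buffer, applies the last butterfly out[i] = t[i] + t[63-i],
+// out[63-i] = t[i] - t[63-i], and shifts with srshl by the negative count
+// in w12 (i.e. a rounding right shift) before narrowing to 16 bit.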
+function inv_txfm_horz_dct_64x4_neon
+        mov             x14, x30
+
+        mov             x7,  sp
+        add             x8,  sp,  #4*4*(64 - 4)
+        add             x9,  x6,  #2*56
+        mov             x10, #2*64
+        mov             x11, #-4*4*4
+
+        dup             v7.4s,  w12
+1:
+        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+
+.macro store_addsub src0, src1, src2, src3
+        sqsub           v1.4s,   \src0,   \src1
+        sqadd           v0.4s,   \src0,   \src1
+        sqsub           v3.4s,   \src2,   \src3
+        srshl           v1.4s,   v1.4s,   v7.4s
+        sqadd           v2.4s,   \src2,   \src3
+        srshl           v3.4s,   v3.4s,   v7.4s
+        srshl           v0.4s,   v0.4s,   v7.4s
+        srshl           v2.4s,   v2.4s,   v7.4s
+        sqxtn           v3.4h,   v3.4s
+        sqxtn2          v3.8h,   v1.4s
+        sqxtn           v0.4h,   v0.4s
+        sqxtn2          v0.8h,   v2.4s
+        rev64           v3.8h,   v3.8h
+        st1             {v0.8h},  [x6], x10
+        st1             {v3.8h},  [x9], x10
+.endm
+        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
+        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
+        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
+        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
+.purgem store_addsub
+        sub             x6,  x6,  x10, lsl #2
+        sub             x9,  x9,  x10, lsl #2
+        add             x6,  x6,  #16
+        sub             x9,  x9,  #16
+
+        cmp             x7,  x8
+        b.lt            1b
+        br              x14
+endfunc
+
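+// Vertical 64-point pass: the same end-to-end pairing, but the butterfly
+// results are rounded, added to the destination rows and clamped to the
+// 10 bpc range directly.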
+function inv_txfm_add_vert_dct_8x64_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+        mov             x7,  sp
+        add             x8,  sp,  #2*8*(64 - 4)
+        add             x9,  x6,  x1, lsl #6
+        sub             x9,  x9,  x1
+        neg             x10, x1
+        mov             x11, #-2*8*4
+
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff (10 bpc pixel max)
+.macro add_dest_addsub src0, src1, src2, src3
+        ld1             {v0.8h}, [x6], x1
+        ld1             {v1.8h}, [x9], x10
+        sqadd           v4.8h,   \src0,   \src1
+        ld1             {v2.8h}, [x6]
+        sqsub           \src0,   \src0,   \src1
+        ld1             {v3.8h}, [x9]
+        sqadd           v5.8h,   \src2,   \src3
+        sqsub           \src2,   \src2,   \src3
+        sub             x6,  x6,  x1
+        sub             x9,  x9,  x10
+        srshr           v4.8h,   v4.8h,   #4
+        srshr           v5.8h,   v5.8h,   #4
+        srshr           \src0,   \src0,   #4
+        sqadd           v0.8h,   v0.8h,   v4.8h
+        srshr           \src2,   \src2,   #4
+        sqadd           v1.8h,   v1.8h,   \src0
+        sqadd           v2.8h,   v2.8h,   v5.8h
+        smax            v0.8h,   v0.8h,   v6.8h
+        sqadd           v3.8h,   v3.8h,   \src2
+        smax            v1.8h,   v1.8h,   v6.8h
+        smin            v0.8h,   v0.8h,   v7.8h
+        smax            v2.8h,   v2.8h,   v6.8h
+        smin            v1.8h,   v1.8h,   v7.8h
+        st1             {v0.8h}, [x6], x1
+        smax            v3.8h,   v3.8h,   v6.8h
+        smin            v2.8h,   v2.8h,   v7.8h
+        st1             {v1.8h}, [x9], x10
+        smin            v3.8h,   v3.8h,   v7.8h
+        st1             {v2.8h}, [x6], x1
+        st1             {v3.8h}, [x9], x10
+.endm
+        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
+        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
+        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
+        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem add_dest_addsub
+        cmp             x7,  x8
+        b.lt            1b
+
+        br              x14
+endfunc
+
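+// Allocate stack space. On Windows the stack is committed one guard page
+// at a time, so sp may not drop more than a page below the last-touched
+// address without probing the new page first; elsewhere the amount is
+// merely split so each subtraction fits the 12-bit (optionally shifted)
+// immediate encoding.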
+.macro sub_sp space
+#ifdef _WIN32
+.if \space > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer.
+        .error          "sub_sp doesn't support values over 8K at the moment"
+.elseif \space > 4096
+        sub             x16, sp,  #4096
+        ldr             xzr, [x16]
+        sub             sp,  x16, #(\space - 4096)
+.else
+        sub             sp,  sp,  #\space
+.endif
+#else
+.if \space >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+#endif
+.endm
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+        idct_dc         64,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+        idct_dc         64,  32,  1
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-1 // shift
+        bl              inv_txfm_dct_clear_scale_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+        idct_dc         32,  64,  1
+
+        mov             x15, x30
+
+        sub_sp          32*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #32*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #32*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+        idct_dc         64,  16,  2
+
+        mov             x15, x30
+
+        sub_sp          64*16*2+64*4*4
+        add             x4,  sp, #64*4*4
+
+        movrel          x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+        add             x6,  x4,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x4,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        movrel          x5,  X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x4,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  x4,  #64*16*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+        idct_dc         16,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          16*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+
+        adr             x4,  inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #16*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #16*32*2
+        br              x15
+endfunc
diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S
new file mode 100644 (file)
index 0000000..d45f208
--- /dev/null
@@ -0,0 +1,1123 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
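+// Loop filter for 16 pixels at a time (8 bpc). \wd selects the filter
+// width; v10 and v11 hold the E and I edge thresholds, and v13-v15 carry
+// the per-lane masks for which filter strength applies (wd >= 4, wd > 4,
+// wd == 16).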
+.macro loop_filter wd
+function lpf_16_wd\wd\()_neon
+        uabd            v0.16b,  v22.16b, v23.16b // abs(p1 - p0)
+        uabd            v1.16b,  v25.16b, v24.16b // abs(q1 - q0)
+        uabd            v2.16b,  v23.16b, v24.16b // abs(p0 - q0)
+        uabd            v3.16b,  v22.16b, v25.16b // abs(p1 - q1)
+.if \wd >= 6
+        uabd            v4.16b,  v21.16b, v22.16b // abs(p2 - p1)
+        uabd            v5.16b,  v26.16b, v25.16b // abs(q2 - q1)
+.endif
+.if \wd >= 8
+        uabd            v6.16b,  v20.16b, v21.16b // abs(p3 - p2)
+        uabd            v7.16b,  v27.16b, v26.16b // abs(q3 - q2)
+.endif
+.if \wd >= 6
+        umax            v4.16b,  v4.16b,  v5.16b
+.endif
+        uqadd           v2.16b,  v2.16b,  v2.16b  // abs(p0 - q0) * 2
+.if \wd >= 8
+        umax            v6.16b,  v6.16b,  v7.16b
+.endif
+        ushr            v3.16b,  v3.16b,  #1
+.if \wd >= 8
+        umax            v4.16b,  v4.16b,  v6.16b
+.endif
+.if \wd >= 6
+        and             v4.16b,  v4.16b,  v14.16b
+.endif
+        umax            v0.16b,  v0.16b,  v1.16b  // max(abs(p1 - p0), abs(q1 - q0))
+        uqadd           v2.16b,  v2.16b,  v3.16b  // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+        umax            v4.16b,  v0.16b,  v4.16b
+        cmhs            v1.16b,  v11.16b, v4.16b  // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+        cmhs            v1.16b,  v11.16b, v0.16b  // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+        cmhs            v2.16b,  v10.16b, v2.16b  // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+        and             v1.16b,  v1.16b,  v2.16b  // fm
+        and             v1.16b,  v1.16b,  v13.16b // fm && wd >= 4
+.if \wd >= 6
+        and             v14.16b, v14.16b, v1.16b  // fm && wd > 4
+.endif
+.if \wd >= 16
+        and             v15.16b, v15.16b, v1.16b  // fm && wd == 16
+.endif
+
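+        // Collapse the mask into two GPRs to test it for all-zero; the mask
+        // bytes are 0x00 or 0xff, so the two halves cannot cancel in the add.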
+        mov             x16, v1.d[0]
+        mov             x17, v1.d[1]
+        adds            x16, x16, x17
+        b.eq            9f                        // if (!fm || wd < 4) return;
+
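+        // flat8in: all of abs(p2 - p0), abs(p1 - p0), abs(q1 - q0) and
+        // abs(q2 - q0) (plus abs(p3 - p0)/abs(q3 - q0) for wd >= 8) <= 1,
+        // the flatness threshold F at 8bpc.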
+.if \wd >= 6
+        movi            v10.16b, #1
+        uabd            v2.16b,  v21.16b, v23.16b // abs(p2 - p0)
+        uabd            v3.16b,  v22.16b, v23.16b // abs(p1 - p0)
+        uabd            v4.16b,  v25.16b, v24.16b // abs(q1 - q0)
+        uabd            v5.16b,  v26.16b, v24.16b // abs(q2 - q0)
+.if \wd >= 8
+        uabd            v6.16b,  v20.16b, v23.16b // abs(p3 - p0)
+        uabd            v7.16b,  v27.16b, v24.16b // abs(q3 - q0)
+.endif
+        umax            v2.16b,  v2.16b,  v3.16b
+        umax            v4.16b,  v4.16b,  v5.16b
+.if \wd >= 8
+        umax            v6.16b,  v6.16b,  v7.16b
+.endif
+        umax            v2.16b,  v2.16b,  v4.16b
+.if \wd >= 8
+        umax            v2.16b,  v2.16b,  v6.16b
+.endif
+
+.if \wd == 16
+        uabd            v3.16b,  v17.16b, v23.16b // abs(p6 - p0)
+        uabd            v4.16b,  v18.16b, v23.16b // abs(p5 - p0)
+        uabd            v5.16b,  v19.16b, v23.16b // abs(p4 - p0)
+.endif
+        cmhs            v2.16b,  v10.16b, v2.16b  // flat8in
+.if \wd == 16
+        uabd            v6.16b,  v28.16b, v24.16b // abs(q4 - q0)
+        uabd            v7.16b,  v29.16b, v24.16b // abs(q5 - q0)
+        uabd            v8.16b,  v30.16b, v24.16b // abs(q6 - q0)
+.endif
+        and             v14.16b, v2.16b,  v14.16b // flat8in && fm && wd > 4
+        bic             v1.16b,  v1.16b,  v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+        umax            v3.16b,  v3.16b,  v4.16b
+        umax            v5.16b,  v5.16b,  v6.16b
+.endif
+        mov             x16, v1.d[0]
+        mov             x17, v1.d[1]
+.if \wd == 16
+        umax            v7.16b,  v7.16b,  v8.16b
+        umax            v3.16b,  v3.16b,  v5.16b
+        umax            v3.16b,  v3.16b,  v7.16b
+        cmhs            v3.16b,  v10.16b, v3.16b  // flat8out
+.endif
+        adds            x16, x16, x17
+.if \wd == 16
+        and             v15.16b, v15.16b, v3.16b  // flat8out && fm && wd == 16
+        and             v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+        bic             v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+        b.eq            1f                        // skip wd == 4 case
+.endif
+
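+        // Narrow (wd == 4) filter; per pixel, this matches the C logic:
+        //   f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
+        //   f1 = imin(f + 4, 127) >> 3;  f2 = imin(f + 3, 127) >> 3;
+        //   p0 = iclip_pixel(p0 + f2);   q0 = iclip_pixel(q0 - f1);
+        //   if (!hev) { p1 += (f1 + 1) >> 1;  q1 -= (f1 + 1) >> 1; }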
+        usubl           v2.8h,   v22.8b,  v25.8b  // p1 - q1
+        usubl2          v3.8h,   v22.16b, v25.16b
+        cmhi            v0.16b,  v0.16b,  v12.16b // hev
+        sqxtn           v2.8b,   v2.8h            // iclip_diff(p1 - q1)
+        sqxtn2          v2.16b,  v3.8h
+        and             v4.16b,  v2.16b,  v0.16b  // if (hev) iclip_diff(p1 - q1)
+        bic             v0.16b,  v1.16b,  v0.16b  // (fm && wd >= 4 && !hev)
+        usubl           v2.8h,   v24.8b,  v23.8b
+        movi            v5.8h,   #3
+        usubl2          v3.8h,   v24.16b, v23.16b
+        mul             v2.8h,   v2.8h,   v5.8h
+        mul             v3.8h,   v3.8h,   v5.8h
+        movi            v6.16b,  #4
+        saddw           v2.8h,   v2.8h,   v4.8b
+        saddw2          v3.8h,   v3.8h,   v4.16b
+        movi            v7.16b,  #3
+        sqxtn           v2.8b,   v2.8h            // f
+        sqxtn2          v2.16b,  v3.8h
+        sqadd           v4.16b,  v6.16b,  v2.16b  // imin(f + 4, 127)
+        sqadd           v5.16b,  v7.16b,  v2.16b  // imin(f + 3, 127)
+        sshr            v4.16b,  v4.16b,  #3      // f1
+        sshr            v5.16b,  v5.16b,  #3      // f2
+        uxtl            v2.8h,   v23.8b           // p0
+        uxtl2           v3.8h,   v23.16b
+        uxtl            v6.8h,   v24.8b           // q0
+        uxtl2           v7.8h,   v24.16b
+        saddw           v2.8h,   v2.8h,   v5.8b
+        saddw2          v3.8h,   v3.8h,   v5.16b
+        ssubw           v6.8h,   v6.8h,   v4.8b
+        ssubw2          v7.8h,   v7.8h,   v4.16b
+        srshr           v4.16b,  v4.16b,  #1      // (f1 + 1) >> 1
+        sqxtun          v2.8b,   v2.8h            // out p0
+        sqxtun2         v2.16b,  v3.8h
+        sqxtun          v6.8b,   v6.8h            // out q0
+        sqxtun2         v6.16b,  v7.8h
+        bit             v23.16b, v2.16b,  v1.16b  // if (fm && wd >= 4)
+        uxtl            v2.8h,   v22.8b           // p1
+        uxtl2           v3.8h,   v22.16b
+        bit             v24.16b, v6.16b,  v1.16b  // if (fm && wd >= 4)
+        uxtl            v6.8h,   v25.8b           // q1
+        uxtl2           v7.8h,   v25.16b
+        saddw           v2.8h,   v2.8h,   v4.8b
+        saddw2          v3.8h,   v3.8h,   v4.16b
+        ssubw           v6.8h,   v6.8h,   v4.8b
+        ssubw2          v7.8h,   v7.8h,   v4.16b
+        sqxtun          v2.8b,   v2.8h            // out p1
+        sqxtun2         v2.16b,  v3.8h
+        sqxtun          v6.8b,   v6.8h            // out q1
+        sqxtun2         v6.16b,  v7.8h
+        bit             v22.16b, v2.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
+        bit             v25.16b, v6.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+        b.eq            2f                        // skip if there's no flat8in
+
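+        // wd == 6 flat filter as a running 8-term sum: each output is
+        // (sum + 4) >> 3, and the window advances by subtracting the oldest
+        // pair and adding the newest one before the next narrowing shift.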
+        uaddl           v0.8h,   v21.8b,  v21.8b  // p2 * 2
+        uaddl2          v1.8h,   v21.16b, v21.16b
+        uaddl           v2.8h,   v21.8b,  v22.8b  // p2 + p1
+        uaddl2          v3.8h,   v21.16b, v22.16b
+        uaddl           v4.8h,   v22.8b,  v23.8b  // p1 + p0
+        uaddl2          v5.8h,   v22.16b, v23.16b
+        uaddl           v6.8h,   v23.8b,  v24.8b  // p0 + q0
+        uaddl2          v7.8h,   v23.16b, v24.16b
+        add             v8.8h,   v0.8h,   v2.8h
+        add             v9.8h,   v1.8h,   v3.8h
+        add             v10.8h,  v4.8h,   v6.8h
+        add             v11.8h,  v5.8h,   v7.8h
+        uaddl           v12.8h,  v24.8b,  v25.8b  // q0 + q1
+        uaddl2          v13.8h,  v24.16b, v25.16b
+        add             v8.8h,   v8.8h,   v10.8h
+        add             v9.8h,   v9.8h,   v11.8h
+        sub             v12.8h,  v12.8h,  v0.8h
+        sub             v13.8h,  v13.8h,  v1.8h
+        uaddl           v10.8h,  v25.8b,  v26.8b  // q1 + q2
+        uaddl2          v11.8h,  v25.16b, v26.16b
+        rshrn           v0.8b,   v8.8h,   #3      // out p1
+        rshrn2          v0.16b,  v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v12.8h
+        add             v9.8h,   v9.8h,   v13.8h
+        sub             v10.8h,  v10.8h,  v2.8h
+        sub             v11.8h,  v11.8h,  v3.8h
+        uaddl           v12.8h,  v26.8b,  v26.8b  // q2 + q2
+        uaddl2          v13.8h,  v26.16b, v26.16b
+        rshrn           v1.8b,   v8.8h,   #3      // out p0
+        rshrn2          v1.16b,  v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v10.8h
+        add             v9.8h,   v9.8h,   v11.8h
+        sub             v12.8h,  v12.8h,  v4.8h
+        sub             v13.8h,  v13.8h,  v5.8h
+        rshrn           v2.8b,   v8.8h,   #3      // out q0
+        rshrn2          v2.16b,  v9.8h,   #3
+
+        bit             v22.16b, v0.16b,  v14.16b // p1 if (flat8in)
+        add             v8.8h,   v8.8h,   v12.8h
+        add             v9.8h,   v9.8h,   v13.8h
+        bit             v23.16b, v1.16b,  v14.16b // p0 if (flat8in)
+        rshrn           v3.8b,   v8.8h,   #3      // out q1
+        rshrn2          v3.16b,  v9.8h,   #3
+        bit             v24.16b, v2.16b,  v14.16b // q0 if (flat8in)
+        bit             v25.16b, v3.16b,  v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+.if \wd == 8
+        b.eq            8f                        // skip if there's no flat8in
+.else
+        b.eq            2f                        // skip if there's no flat8in
+.endif
+
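+        // wd == 8 flat filter: the same running-sum scheme, seeded with
+        // 2 * (p3 + p2) + p3 + p1 + p0 + q0 for the first output (p2).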
+        uaddl           v0.8h,   v20.8b,  v21.8b  // p3 + p2
+        uaddl2          v1.8h,   v20.16b, v21.16b
+        uaddl           v2.8h,   v22.8b,  v25.8b  // p1 + q1
+        uaddl2          v3.8h,   v22.16b, v25.16b
+        uaddl           v4.8h,   v20.8b,  v22.8b  // p3 + p1
+        uaddl2          v5.8h,   v20.16b, v22.16b
+        uaddl           v6.8h,   v23.8b,  v26.8b  // p0 + q2
+        uaddl2          v7.8h,   v23.16b, v26.16b
+        add             v8.8h,   v0.8h,   v0.8h   // 2 * (p3 + p2)
+        add             v9.8h,   v1.8h,   v1.8h
+        uaddw           v8.8h,   v8.8h,   v23.8b  // + p0
+        uaddw2          v9.8h,   v9.8h,   v23.16b
+        uaddw           v8.8h,   v8.8h,   v24.8b  // + q0
+        uaddw2          v9.8h,   v9.8h,   v24.16b
+        add             v8.8h,   v8.8h,   v4.8h
+        add             v9.8h,   v9.8h,   v5.8h   // + p3 + p1
+        sub             v2.8h,   v2.8h,   v0.8h   // p1 + q1 - p3 - p2
+        sub             v3.8h,   v3.8h,   v1.8h
+        sub             v6.8h,   v6.8h,   v4.8h   // p0 + q2 - p3 - p1
+        sub             v7.8h,   v7.8h,   v5.8h
+        rshrn           v10.8b,  v8.8h,   #3      // out p2
+        rshrn2          v10.16b, v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v2.8h
+        add             v9.8h,   v9.8h,   v3.8h
+        uaddl           v0.8h,   v20.8b,  v23.8b  // p3 + p0
+        uaddl2          v1.8h,   v20.16b, v23.16b
+        uaddl           v2.8h,   v24.8b,  v27.8b  // q0 + q3
+        uaddl2          v3.8h,   v24.16b, v27.16b
+        rshrn           v11.8b,  v8.8h,   #3      // out p1
+        rshrn2          v11.16b, v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v6.8h
+        add             v9.8h,   v9.8h,   v7.8h
+        sub             v2.8h,   v2.8h,   v0.8h   // q0 + q3 - p3 - p0
+        sub             v3.8h,   v3.8h,   v1.8h
+        uaddl           v4.8h,   v21.8b,  v24.8b  // p2 + q0
+        uaddl2          v5.8h,   v21.16b, v24.16b
+        uaddl           v6.8h,   v25.8b,  v27.8b  // q1 + q3
+        uaddl2          v7.8h,   v25.16b, v27.16b
+        rshrn           v12.8b,  v8.8h,   #3      // out p0
+        rshrn2          v12.16b, v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v2.8h
+        add             v9.8h,   v9.8h,   v3.8h
+        sub             v6.8h,   v6.8h,   v4.8h   // q1 + q3 - p2 - q0
+        sub             v7.8h,   v7.8h,   v5.8h
+        uaddl           v0.8h,   v22.8b,  v25.8b  // p1 + q1
+        uaddl2          v1.8h,   v22.16b, v25.16b
+        uaddl           v2.8h,   v26.8b,  v27.8b  // q2 + q3
+        uaddl2          v3.8h,   v26.16b, v27.16b
+        rshrn           v13.8b,  v8.8h,   #3      // out q0
+        rshrn2          v13.16b, v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v6.8h
+        add             v9.8h,   v9.8h,   v7.8h
+        sub             v2.8h,   v2.8h,   v0.8h   // q2 + q3 - p1 - q1
+        sub             v3.8h,   v3.8h,   v1.8h
+        rshrn           v0.8b,   v8.8h,   #3      // out q1
+        rshrn2          v0.16b,  v9.8h,   #3
+
+        add             v8.8h,   v8.8h,   v2.8h
+        add             v9.8h,   v9.8h,   v3.8h
+
+        bit             v21.16b, v10.16b, v14.16b
+        bit             v22.16b, v11.16b, v14.16b
+        bit             v23.16b, v12.16b, v14.16b
+        rshrn           v1.8b,   v8.8h,   #3      // out q2
+        rshrn2          v1.16b,  v9.8h,   #3
+        bit             v24.16b, v13.16b, v14.16b
+        bit             v25.16b, v0.16b,  v14.16b
+        bit             v26.16b, v1.16b,  v14.16b
+.endif
+2:
+.if \wd == 16
+        mov             x16, v15.d[0]
+        mov             x17, v15.d[1]
+        adds            x16, x16, x17
+        b.ne            1f                        // check if flat8out is needed
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+        b.eq            8f                        // if there was no flat8in, just write the inner 4 pixels
+        b               7f                        // if flat8in was used, write the inner 6 pixels
+1:
+
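+        // flat8out (wd == 16) filter: a running 16-term sum with outputs
+        // (sum + 8) >> 4, seeded with p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 +
+        // p1 + p0 + q0 for the first output (p5).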
+        uaddl           v2.8h,   v17.8b,  v17.8b  // p6 + p6
+        uaddl2          v3.8h,   v17.16b, v17.16b
+        uaddl           v4.8h,   v17.8b,  v18.8b  // p6 + p5
+        uaddl2          v5.8h,   v17.16b, v18.16b
+        uaddl           v6.8h,   v17.8b,  v19.8b  // p6 + p4
+        uaddl2          v7.8h,   v17.16b, v19.16b
+        uaddl           v8.8h,   v17.8b,  v20.8b  // p6 + p3
+        uaddl2          v9.8h,   v17.16b, v20.16b
+        add             v12.8h,  v2.8h,   v4.8h
+        add             v13.8h,  v3.8h,   v5.8h
+        add             v10.8h,  v6.8h,   v8.8h
+        add             v11.8h,  v7.8h,   v9.8h
+        uaddl           v6.8h,   v17.8b,  v21.8b  // p6 + p2
+        uaddl2          v7.8h,   v17.16b, v21.16b
+        add             v12.8h,  v12.8h,  v10.8h
+        add             v13.8h,  v13.8h,  v11.8h
+        uaddl           v8.8h,   v17.8b,  v22.8b  // p6 + p1
+        uaddl2          v9.8h,   v17.16b, v22.16b
+        uaddl           v10.8h,  v18.8b,  v23.8b  // p5 + p0
+        uaddl2          v11.8h,  v18.16b, v23.16b
+        add             v6.8h,   v6.8h,   v8.8h
+        add             v7.8h,   v7.8h,   v9.8h
+        uaddl           v8.8h,   v19.8b,  v24.8b  // p4 + q0
+        uaddl2          v9.8h,   v19.16b, v24.16b
+        add             v12.8h,  v12.8h,  v6.8h
+        add             v13.8h,  v13.8h,  v7.8h
+        add             v10.8h,  v10.8h,  v8.8h
+        add             v11.8h,  v11.8h,  v9.8h
+        uaddl           v6.8h,   v20.8b,  v25.8b  // p3 + q1
+        uaddl2          v7.8h,   v20.16b, v25.16b
+        add             v12.8h,  v12.8h,  v10.8h
+        add             v13.8h,  v13.8h,  v11.8h
+        sub             v6.8h,   v6.8h,   v2.8h
+        sub             v7.8h,   v7.8h,   v3.8h
+        uaddl           v2.8h,   v21.8b,  v26.8b  // p2 + q2
+        uaddl2          v3.8h,   v21.16b, v26.16b
+        rshrn           v0.8b,   v12.8h,  #4      // out p5
+        rshrn2          v0.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p6) + (p3 + q1)
+        add             v13.8h,  v13.8h,  v7.8h
+        sub             v2.8h,   v2.8h,   v4.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+        uaddl           v4.8h,   v22.8b,  v27.8b  // p1 + q3
+        uaddl2          v5.8h,   v22.16b, v27.16b
+        uaddl           v6.8h,   v17.8b,  v19.8b  // p6 + p4
+        uaddl2          v7.8h,   v17.16b, v19.16b
+        rshrn           v1.8b,   v12.8h,  #4      // out p4
+        rshrn2          v1.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v2.8h   // - (p6 + p5) + (p2 + q2)
+        add             v13.8h,  v13.8h,  v3.8h
+        sub             v4.8h,   v4.8h,   v6.8h
+        sub             v5.8h,   v5.8h,   v7.8h
+        uaddl           v6.8h,   v23.8b,  v28.8b  // p0 + q4
+        uaddl2          v7.8h,   v23.16b, v28.16b
+        uaddl           v8.8h,   v17.8b,  v20.8b  // p6 + p3
+        uaddl2          v9.8h,   v17.16b, v20.16b
+        rshrn           v2.8b,   v12.8h,  #4      // out p3
+        rshrn2          v2.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v4.8h   // - (p6 + p4) + (p1 + q3)
+        add             v13.8h,  v13.8h,  v5.8h
+        sub             v6.8h,   v6.8h,   v8.8h
+        sub             v7.8h,   v7.8h,   v9.8h
+        uaddl           v8.8h,   v24.8b,  v29.8b  // q0 + q5
+        uaddl2          v9.8h,   v24.16b, v29.16b
+        uaddl           v4.8h,   v17.8b,  v21.8b  // p6 + p2
+        uaddl2          v5.8h,   v17.16b, v21.16b
+        rshrn           v3.8b,   v12.8h,  #4      // out p2
+        rshrn2          v3.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p3) + (p0 + q4)
+        add             v13.8h,  v13.8h,  v7.8h
+        sub             v8.8h,   v8.8h,   v4.8h
+        sub             v9.8h,   v9.8h,   v5.8h
+        uaddl           v6.8h,   v25.8b,  v30.8b  // q1 + q6
+        uaddl2          v7.8h,   v25.16b, v30.16b
+        uaddl           v10.8h,  v17.8b,  v22.8b  // p6 + p1
+        uaddl2          v11.8h,  v17.16b, v22.16b
+        rshrn           v4.8b,   v12.8h,  #4      // out p1
+        rshrn2          v4.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v8.8h   // - (p6 + p2) + (q0 + q5)
+        add             v13.8h,  v13.8h,  v9.8h
+        sub             v6.8h,   v6.8h,   v10.8h
+        sub             v7.8h,   v7.8h,   v11.8h
+        uaddl           v8.8h,   v26.8b,  v30.8b  // q2 + q6
+        uaddl2          v9.8h,   v26.16b, v30.16b
+        bif             v0.16b,  v18.16b, v15.16b // out p5
+        uaddl           v10.8h,  v18.8b,  v23.8b  // p5 + p0
+        uaddl2          v11.8h,  v18.16b, v23.16b
+        rshrn           v5.8b,   v12.8h,  #4      // out p0
+        rshrn2          v5.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p1) + (q1 + q6)
+        add             v13.8h,  v13.8h,  v7.8h
+        sub             v8.8h,   v8.8h,   v10.8h
+        sub             v9.8h,   v9.8h,   v11.8h
+        uaddl           v10.8h,  v27.8b,  v30.8b  // q3 + q6
+        uaddl2          v11.8h,  v27.16b, v30.16b
+        bif             v1.16b,  v19.16b, v15.16b // out p4
+        uaddl           v18.8h,  v19.8b,  v24.8b  // p4 + q0
+        uaddl2          v19.8h,  v19.16b, v24.16b
+        rshrn           v6.8b,   v12.8h,  #4      // out q0
+        rshrn2          v6.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v8.8h   // - (p5 + p0) + (q2 + q6)
+        add             v13.8h,  v13.8h,  v9.8h
+        sub             v10.8h,  v10.8h,  v18.8h
+        sub             v11.8h,  v11.8h,  v19.8h
+        uaddl           v8.8h,   v28.8b,  v30.8b  // q4 + q6
+        uaddl2          v9.8h,   v28.16b, v30.16b
+        bif             v2.16b,  v20.16b, v15.16b // out p3
+        uaddl           v18.8h,  v20.8b,  v25.8b  // p3 + q1
+        uaddl2          v19.8h,  v20.16b, v25.16b
+        rshrn           v7.8b,   v12.8h,  #4      // out q1
+        rshrn2          v7.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v10.8h  // - (p4 + q0) + (q3 + q6)
+        add             v13.8h,  v13.8h,  v11.8h
+        sub             v18.8h,  v8.8h,   v18.8h
+        sub             v19.8h,  v9.8h,   v19.8h
+        uaddl           v10.8h,  v29.8b,  v30.8b  // q5 + q6
+        uaddl2          v11.8h,  v29.16b, v30.16b
+        bif             v3.16b,  v21.16b, v15.16b // out p2
+        uaddl           v20.8h,  v21.8b,  v26.8b  // p2 + q2
+        uaddl2          v21.8h,  v21.16b, v26.16b
+        rshrn           v8.8b,   v12.8h,  #4      // out q2
+        rshrn2          v8.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v18.8h  // - (p3 + q1) + (q4 + q6)
+        add             v13.8h,  v13.8h,  v19.8h
+        sub             v10.8h,  v10.8h,  v20.8h
+        sub             v11.8h,  v11.8h,  v21.8h
+        uaddl           v18.8h,  v30.8b,  v30.8b  // q6 + q6
+        uaddl2          v19.8h,  v30.16b, v30.16b
+        bif             v4.16b,  v22.16b, v15.16b // out p1
+        uaddl           v20.8h,  v22.8b,  v27.8b  // p1 + q3
+        uaddl2          v21.8h,  v22.16b, v27.16b
+        rshrn           v9.8b,   v12.8h,  #4      // out q3
+        rshrn2          v9.16b,  v13.8h,  #4
+        add             v12.8h,  v12.8h,  v10.8h  // - (p2 + q2) + (q5 + q6)
+        add             v13.8h,  v13.8h,  v11.8h
+        sub             v18.8h,  v18.8h,  v20.8h
+        sub             v19.8h,  v19.8h,  v21.8h
+        bif             v5.16b,  v23.16b, v15.16b // out p0
+        rshrn           v10.8b,  v12.8h,  #4      // out q4
+        rshrn2          v10.16b, v13.8h,  #4
+        add             v12.8h,  v12.8h,  v18.8h  // - (p1 + q3) + (q6 + q6)
+        add             v13.8h,  v13.8h,  v19.8h
+        rshrn           v11.8b,  v12.8h,  #4      // out q5
+        rshrn2          v11.16b, v13.8h,  #4
+        bif             v6.16b,  v24.16b, v15.16b // out q0
+        bif             v7.16b,  v25.16b, v15.16b // out q1
+        bif             v8.16b,  v26.16b, v15.16b // out q2
+        bif             v9.16b,  v27.16b, v15.16b // out q3
+        bif             v10.16b, v28.16b, v15.16b // out q4
+        bif             v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+        ret
+.if \wd == 16
+7:
+        // Return to a shorter epilogue, writing only the inner 6 pixels
+        br              x13
+.endif
+.if \wd >= 8
+8:
+        // Return to a shorter epilogue, writing only the inner 4 pixels
+        br              x14
+.endif
+9:
+        // Return directly without writing back any pixels
+        br              x15
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
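+// Entry macros for the filter core. Besides the normal return (store all
+// filtered pixels), the core can return through x13 (label 7 in the caller,
+// store only the inner 6 pixels), x14 (label 8, inner 4 pixels) or x15
+// (skip the caller's store epilogue entirely, nothing was filtered).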
+.macro lpf_16_wd16
+        adr             x13, 7f
+        adr             x14, 8f
+        bl              lpf_16_wd16_neon
+.endm
+
+.macro lpf_16_wd8
+        adr             x14, 8f
+        bl              lpf_16_wd8_neon
+.endm
+
+.macro lpf_16_wd6
+        bl              lpf_16_wd6_neon
+.endm
+
+.macro lpf_16_wd4
+        bl              lpf_16_wd4_neon
+.endm
+
+function lpf_v_4_16_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #1
+        ld1             {v22.16b}, [x16], x1 // p1
+        ld1             {v24.16b}, [x0],  x1 // q0
+        ld1             {v23.16b}, [x16], x1 // p0
+        ld1             {v25.16b}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+
+        lpf_16_wd4
+
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v25.16b}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
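+// The _h_ variants filter across a vertical edge: load the pixels straddling
+// the edge from each of 16 rows, transpose so that every vector holds one
+// sample position (p1, p0, q0, q1, ...) for all 16 rows, run the same filter
+// core, then transpose back before storing.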
+function lpf_h_4_16_neon
+        mov             x15, x30
+        sub             x16, x0,  #2
+        add             x0,  x16, x1, lsl #3
+        ld1             {v22.s}[0], [x16], x1
+        ld1             {v22.s}[2], [x0],  x1
+        ld1             {v23.s}[0], [x16], x1
+        ld1             {v23.s}[2], [x0],  x1
+        ld1             {v24.s}[0], [x16], x1
+        ld1             {v24.s}[2], [x0],  x1
+        ld1             {v25.s}[0], [x16], x1
+        ld1             {v25.s}[2], [x0],  x1
+        ld1             {v22.s}[1], [x16], x1
+        ld1             {v22.s}[3], [x0],  x1
+        ld1             {v23.s}[1], [x16], x1
+        ld1             {v23.s}[3], [x0],  x1
+        ld1             {v24.s}[1], [x16], x1
+        ld1             {v24.s}[3], [x0],  x1
+        ld1             {v25.s}[1], [x16], x1
+        ld1             {v25.s}[3], [x0],  x1
+        add             x0,  x0,  #2
+
+        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_16_wd4
+
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #2
+        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v22.s}[0], [x16], x1
+        st1             {v22.s}[2], [x0],  x1
+        st1             {v23.s}[0], [x16], x1
+        st1             {v23.s}[2], [x0],  x1
+        st1             {v24.s}[0], [x16], x1
+        st1             {v24.s}[2], [x0],  x1
+        st1             {v25.s}[0], [x16], x1
+        st1             {v25.s}[2], [x0],  x1
+        st1             {v22.s}[1], [x16], x1
+        st1             {v22.s}[3], [x0],  x1
+        st1             {v23.s}[1], [x16], x1
+        st1             {v23.s}[3], [x0],  x1
+        st1             {v24.s}[1], [x16], x1
+        st1             {v24.s}[3], [x0],  x1
+        st1             {v25.s}[1], [x16], x1
+        st1             {v25.s}[3], [x0],  x1
+        add             x0,  x0,  #2
+        br              x15
+endfunc
+
+function lpf_v_6_16_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #1
+        sub             x16, x16, x1
+        ld1             {v21.16b}, [x16], x1 // p2
+        ld1             {v24.16b}, [x0],  x1 // q0
+        ld1             {v22.16b}, [x16], x1 // p1
+        ld1             {v25.16b}, [x0],  x1 // q1
+        ld1             {v23.16b}, [x16], x1 // p0
+        ld1             {v26.16b}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+
+        lpf_16_wd6
+
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v25.16b}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_6_16_neon
+        mov             x15, x30
+        sub             x16, x0,  #4
+        add             x0,  x16, x1, lsl #3
+        ld1             {v20.d}[0], [x16], x1
+        ld1             {v20.d}[1], [x0],  x1
+        ld1             {v21.d}[0], [x16], x1
+        ld1             {v21.d}[1], [x0],  x1
+        ld1             {v22.d}[0], [x16], x1
+        ld1             {v22.d}[1], [x0],  x1
+        ld1             {v23.d}[0], [x16], x1
+        ld1             {v23.d}[1], [x0],  x1
+        ld1             {v24.d}[0], [x16], x1
+        ld1             {v24.d}[1], [x0],  x1
+        ld1             {v25.d}[0], [x16], x1
+        ld1             {v25.d}[1], [x0],  x1
+        ld1             {v26.d}[0], [x16], x1
+        ld1             {v26.d}[1], [x0],  x1
+        ld1             {v27.d}[0], [x16], x1
+        ld1             {v27.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+
+        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_16_wd6
+
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #2
+        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v22.s}[0], [x16], x1
+        st1             {v22.s}[2], [x0],  x1
+        st1             {v23.s}[0], [x16], x1
+        st1             {v23.s}[2], [x0],  x1
+        st1             {v24.s}[0], [x16], x1
+        st1             {v24.s}[2], [x0],  x1
+        st1             {v25.s}[0], [x16], x1
+        st1             {v25.s}[2], [x0],  x1
+        st1             {v22.s}[1], [x16], x1
+        st1             {v22.s}[3], [x0],  x1
+        st1             {v23.s}[1], [x16], x1
+        st1             {v23.s}[3], [x0],  x1
+        st1             {v24.s}[1], [x16], x1
+        st1             {v24.s}[3], [x0],  x1
+        st1             {v25.s}[1], [x16], x1
+        st1             {v25.s}[3], [x0],  x1
+        add             x0,  x0,  #2
+        br              x15
+endfunc
+
+function lpf_v_8_16_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #2
+        ld1             {v20.16b}, [x16], x1 // p3
+        ld1             {v24.16b}, [x0],  x1 // q0
+        ld1             {v21.16b}, [x16], x1 // p2
+        ld1             {v25.16b}, [x0],  x1 // q1
+        ld1             {v22.16b}, [x16], x1 // p1
+        ld1             {v26.16b}, [x0],  x1 // q2
+        ld1             {v23.16b}, [x16], x1 // p0
+        ld1             {v27.16b}, [x0],  x1 // q3
+        sub             x0,  x0,  x1, lsl #2
+
+        lpf_16_wd8
+
+        sub             x16, x0,  x1, lsl #1
+        sub             x16, x16, x1
+        st1             {v21.16b}, [x16], x1 // p2
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v25.16b}, [x0],  x1 // q1
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v26.16b}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x15
+
+8:
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v25.16b}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_8_16_neon
+        mov             x15, x30
+        sub             x16, x0,  #4
+        add             x0,  x16, x1, lsl #3
+        ld1             {v20.d}[0], [x16], x1
+        ld1             {v20.d}[1], [x0],  x1
+        ld1             {v21.d}[0], [x16], x1
+        ld1             {v21.d}[1], [x0],  x1
+        ld1             {v22.d}[0], [x16], x1
+        ld1             {v22.d}[1], [x0],  x1
+        ld1             {v23.d}[0], [x16], x1
+        ld1             {v23.d}[1], [x0],  x1
+        ld1             {v24.d}[0], [x16], x1
+        ld1             {v24.d}[1], [x0],  x1
+        ld1             {v25.d}[0], [x16], x1
+        ld1             {v25.d}[1], [x0],  x1
+        ld1             {v26.d}[0], [x16], x1
+        ld1             {v26.d}[1], [x0],  x1
+        ld1             {v27.d}[0], [x16], x1
+        ld1             {v27.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+
+        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_16_wd8
+
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #4
+        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v20.d}[0], [x16], x1
+        st1             {v20.d}[1], [x0],  x1
+        st1             {v21.d}[0], [x16], x1
+        st1             {v21.d}[1], [x0],  x1
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        st1             {v26.d}[0], [x16], x1
+        st1             {v26.d}[1], [x0],  x1
+        st1             {v27.d}[0], [x16], x1
+        st1             {v27.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+8:
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #2
+        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v22.s}[0], [x16], x1
+        st1             {v22.s}[2], [x0],  x1
+        st1             {v23.s}[0], [x16], x1
+        st1             {v23.s}[2], [x0],  x1
+        st1             {v24.s}[0], [x16], x1
+        st1             {v24.s}[2], [x0],  x1
+        st1             {v25.s}[0], [x16], x1
+        st1             {v25.s}[2], [x0],  x1
+        st1             {v22.s}[1], [x16], x1
+        st1             {v22.s}[3], [x0],  x1
+        st1             {v23.s}[1], [x16], x1
+        st1             {v23.s}[3], [x0],  x1
+        st1             {v24.s}[1], [x16], x1
+        st1             {v24.s}[3], [x0],  x1
+        st1             {v25.s}[1], [x16], x1
+        st1             {v25.s}[3], [x0],  x1
+        add             x0,  x0,  #2
+        br              x15
+endfunc
+
+function lpf_v_16_16_neon
+        mov             x15, x30
+
+        sub             x16, x0,  x1, lsl #3
+        add             x16, x16, x1
+        ld1             {v17.16b}, [x16], x1 // p6
+        ld1             {v24.16b}, [x0],  x1 // q0
+        ld1             {v18.16b}, [x16], x1 // p5
+        ld1             {v25.16b}, [x0],  x1 // q1
+        ld1             {v19.16b}, [x16], x1 // p4
+        ld1             {v26.16b}, [x0],  x1 // q2
+        ld1             {v20.16b}, [x16], x1 // p3
+        ld1             {v27.16b}, [x0],  x1 // q3
+        ld1             {v21.16b}, [x16], x1 // p2
+        ld1             {v28.16b}, [x0],  x1 // q4
+        ld1             {v22.16b}, [x16], x1 // p1
+        ld1             {v29.16b}, [x0],  x1 // q5
+        ld1             {v23.16b}, [x16], x1 // p0
+        ld1             {v30.16b}, [x0],  x1 // q6
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  x1
+
+        lpf_16_wd16
+
+        sub             x16, x0,  x1, lsl #2
+        sub             x16, x16, x1, lsl #1
+        st1             {v0.16b},  [x16], x1 // p5
+        st1             {v6.16b},  [x0],  x1 // q0
+        st1             {v1.16b},  [x16], x1 // p4
+        st1             {v7.16b},  [x0],  x1 // q1
+        st1             {v2.16b},  [x16], x1 // p3
+        st1             {v8.16b},  [x0],  x1 // q2
+        st1             {v3.16b},  [x16], x1 // p2
+        st1             {v9.16b},  [x0],  x1 // q3
+        st1             {v4.16b},  [x16], x1 // p1
+        st1             {v10.16b}, [x0],  x1 // q4
+        st1             {v5.16b},  [x16], x1 // p0
+        st1             {v11.16b}, [x0],  x1 // q5
+        sub             x0,  x0,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+7:
+        sub             x16, x0,  x1
+        sub             x16, x16, x1, lsl #1
+        st1             {v21.16b}, [x16], x1 // p2
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v25.16b}, [x0],  x1 // q1
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v26.16b}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x15
+
+8:
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.16b}, [x16], x1 // p1
+        st1             {v24.16b}, [x0],  x1 // q0
+        st1             {v23.16b}, [x16], x1 // p0
+        st1             {v25.16b}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_16_16_neon
+        mov             x15, x30
+        sub             x16, x0,  #8
+        ld1             {v16.d}[0], [x16], x1
+        ld1             {v24.d}[0], [x0],  x1
+        ld1             {v17.d}[0], [x16], x1
+        ld1             {v25.d}[0], [x0],  x1
+        ld1             {v18.d}[0], [x16], x1
+        ld1             {v26.d}[0], [x0],  x1
+        ld1             {v19.d}[0], [x16], x1
+        ld1             {v27.d}[0], [x0],  x1
+        ld1             {v20.d}[0], [x16], x1
+        ld1             {v28.d}[0], [x0],  x1
+        ld1             {v21.d}[0], [x16], x1
+        ld1             {v29.d}[0], [x0],  x1
+        ld1             {v22.d}[0], [x16], x1
+        ld1             {v30.d}[0], [x0],  x1
+        ld1             {v23.d}[0], [x16], x1
+        ld1             {v31.d}[0], [x0],  x1
+        ld1             {v16.d}[1], [x16], x1
+        ld1             {v24.d}[1], [x0],  x1
+        ld1             {v17.d}[1], [x16], x1
+        ld1             {v25.d}[1], [x0],  x1
+        ld1             {v18.d}[1], [x16], x1
+        ld1             {v26.d}[1], [x0],  x1
+        ld1             {v19.d}[1], [x16], x1
+        ld1             {v27.d}[1], [x0],  x1
+        ld1             {v20.d}[1], [x16], x1
+        ld1             {v28.d}[1], [x0],  x1
+        ld1             {v21.d}[1], [x16], x1
+        ld1             {v29.d}[1], [x0],  x1
+        ld1             {v22.d}[1], [x16], x1
+        ld1             {v30.d}[1], [x0],  x1
+        ld1             {v23.d}[1], [x16], x1
+        ld1             {v31.d}[1], [x0],  x1
+
+        transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1
+
+        lpf_16_wd16
+
+        sub             x0,  x0,  x1, lsl #4
+        sub             x16, x0,  #8
+
+        transpose_8x16b v16, v17, v0,  v1,  v2,  v3,  v4,  v5,  v18, v19
+        transpose_8x16b v6,  v7,  v8,  v9,  v10, v11, v30, v31, v18, v19
+
+        st1             {v16.d}[0], [x16], x1
+        st1             {v6.d}[0],  [x0],  x1
+        st1             {v17.d}[0], [x16], x1
+        st1             {v7.d}[0],  [x0],  x1
+        st1             {v0.d}[0],  [x16], x1
+        st1             {v8.d}[0],  [x0],  x1
+        st1             {v1.d}[0],  [x16], x1
+        st1             {v9.d}[0],  [x0],  x1
+        st1             {v2.d}[0],  [x16], x1
+        st1             {v10.d}[0], [x0],  x1
+        st1             {v3.d}[0],  [x16], x1
+        st1             {v11.d}[0], [x0],  x1
+        st1             {v4.d}[0],  [x16], x1
+        st1             {v30.d}[0], [x0],  x1
+        st1             {v5.d}[0],  [x16], x1
+        st1             {v31.d}[0], [x0],  x1
+        st1             {v16.d}[1], [x16], x1
+        st1             {v6.d}[1],  [x0],  x1
+        st1             {v17.d}[1], [x16], x1
+        st1             {v7.d}[1],  [x0],  x1
+        st1             {v0.d}[1],  [x16], x1
+        st1             {v8.d}[1],  [x0],  x1
+        st1             {v1.d}[1],  [x16], x1
+        st1             {v9.d}[1],  [x0],  x1
+        st1             {v2.d}[1],  [x16], x1
+        st1             {v10.d}[1], [x0],  x1
+        st1             {v3.d}[1],  [x16], x1
+        st1             {v11.d}[1], [x0],  x1
+        st1             {v4.d}[1],  [x16], x1
+        st1             {v30.d}[1], [x0],  x1
+        st1             {v5.d}[1],  [x16], x1
+        st1             {v31.d}[1], [x0],  x1
+        br              x15
+
+7:
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #4
+        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v20.d}[0], [x16], x1
+        st1             {v20.d}[1], [x0],  x1
+        st1             {v21.d}[0], [x16], x1
+        st1             {v21.d}[1], [x0],  x1
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        st1             {v26.d}[0], [x16], x1
+        st1             {v26.d}[1], [x0],  x1
+        st1             {v27.d}[0], [x16], x1
+        st1             {v27.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+8:
+        sub             x16, x0,  x1, lsl #4
+        sub             x16, x16, #2
+        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #3
+
+        st1             {v22.s}[0], [x16], x1
+        st1             {v22.s}[2], [x0],  x1
+        st1             {v23.s}[0], [x16], x1
+        st1             {v23.s}[2], [x0],  x1
+        st1             {v24.s}[0], [x16], x1
+        st1             {v24.s}[2], [x0],  x1
+        st1             {v25.s}[0], [x16], x1
+        st1             {v25.s}[2], [x0],  x1
+        st1             {v22.s}[1], [x16], x1
+        st1             {v22.s}[3], [x0],  x1
+        st1             {v23.s}[1], [x16], x1
+        st1             {v23.s}[3], [x0],  x1
+        st1             {v24.s}[1], [x16], x1
+        st1             {v24.s}[3], [x0],  x1
+        st1             {v25.s}[1], [x16], x1
+        st1             {v25.s}[3], [x0],  x1
+        add             x0,  x0,  #2
+        br              x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                 const uint32_t *const vmask,
+//                                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
+//                                 const Av1FilterLUT *lut, const int w)
+
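+// Driver: each iteration handles a 16-pixel stripe made up of four 4-pixel
+// units. The vmask bits pick the filter width per unit (vmask[2] -> wd16,
+// vmask[1] -> wd8 for luma / wd6 for chroma, vmask[0] -> wd4), and the
+// thresholds are derived from the filter level L as:
+//   I = imax(imin(L >> sharp[0], sharp[1]), 1);  E = 2 * (L + 2) + I;
+//   H = L >> 4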
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+        mov             x11, x30
+        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+        ldp             w6,  w7,  [x2]           // vmask[0], vmask[1]
+.ifc \type, y
+        ldr             w2,  [x2, #8]            // vmask[2]
+.endif
+        add             x5,  x5,  #128           // Move to sharp part of lut
+.ifc \type, y
+        orr             w7,  w7,  w2             // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+        sub             x4,  x3,  x4, lsl #2
+.else
+        sub             x3,  x3,  #4
+        lsl             x4,  x4,  #2
+.endif
+        orr             w6,  w6,  w7             // vmask[0] |= vmask[1]
+
+1:
+        tst             w6,  #0x0f
+.ifc \dir, v
+        ld1             {v0.16b}, [x4], #16
+        ld1             {v1.16b}, [x3], #16
+.else
+        ld2             {v0.s,v1.s}[0], [x3], x4
+        ld2             {v0.s,v1.s}[1], [x3], x4
+        ld2             {v0.s,v1.s}[2], [x3], x4
+        ld2             {v0.s,v1.s}[3], [x3], x4
+.endif
+        b.eq            7f                        // if (!(vm & bits)) continue;
+
+        ld1r            {v5.16b}, [x5]            // sharp[0]
+        add             x5,  x5,  #8
+        movi            v2.4s,   #0xff
+        dup             v13.4s,  w6               // vmask[0]
+
+        and             v0.16b,  v0.16b,  v2.16b  // Keep only lowest byte in each 32 bit word
+        and             v1.16b,  v1.16b,  v2.16b
+        cmtst           v3.16b,  v1.16b,  v2.16b  // Check for nonzero values in l[0][0]
+        movi            v4.16b,  #1
+        ld1r            {v6.16b}, [x5]            // sharp[1]
+        sub             x5,  x5,  #8
+        bif             v1.16b,  v0.16b,  v3.16b  // if (!l[0][0]) L = l[offset][0]
+        mul             v1.4s,   v1.4s,   v4.4s   // L
+.ifc \type, y
+        dup             v15.4s,  w2               // vmask[2]
+.endif
+        cmtst           v2.4s,   v1.4s,   v2.4s   // L != 0
+        dup             v14.4s,  w7               // vmask[1]
+        mov             x16, v2.d[0]
+        mov             x17, v2.d[1]
+        adds            x16, x16, x17
+        b.eq            7f                        // if (!L) continue;
+        neg             v5.16b,  v5.16b           // -sharp[0]
+        movrel          x16,  word_1248
+        ushr            v12.16b, v1.16b,  #4      // H
+        ld1             {v16.4s}, [x16]
+        sshl            v3.16b,  v1.16b,  v5.16b  // L >> sharp[0]
+.ifc \type, y
+        cmtst           v15.4s,  v15.4s,  v16.4s  // if (vmask[2] & bits)
+.endif
+        movi            v7.16b,  #2
+        umin            v3.16b,  v3.16b,  v6.16b  // imin(L >> sharp[0], sharp[1])
+        add             v0.16b,  v1.16b,  v7.16b  // L + 2
+        umax            v11.16b, v3.16b,  v4.16b  // imax(imin(), 1) = limit = I
+        add             v0.16b,  v0.16b,  v0.16b  // 2*(L + 2)
+        cmtst           v14.4s,  v14.4s,  v16.4s  // if (vmask[1] & bits)
+        add             v10.16b, v0.16b,  v11.16b // 2*(L + 2) + limit = E
+        cmtst           v13.4s,  v13.4s,  v16.4s  // if (vmask[0] & bits)
+        and             v13.16b, v13.16b, v2.16b  // vmask[0] &= L != 0
+
+.ifc \type, y
+        tst             w2,  #0x0f
+        b.eq            2f
+        // wd16
+        bl              lpf_\dir\()_16_16_neon
+        b               8f
+2:
+.endif
+        tst             w7,  #0x0f
+        b.eq            3f
+.ifc \type, y
+        // wd8
+        bl              lpf_\dir\()_8_16_neon
+.else
+        // wd6
+        bl              lpf_\dir\()_6_16_neon
+.endif
+        b               8f
+3:
+        // wd4
+        bl              lpf_\dir\()_4_16_neon
+.ifc \dir, h
+        b               8f
+7:
+        // For dir h, the functions above increment x0.
+        // If the whole function is skipped, increment it here instead.
+        add             x0,  x0,  x1,  lsl #4
+.else
+7:
+.endif
+8:
+        lsr             w6,  w6,  #4              // vmask[0] >>= 4
+        lsr             w7,  w7,  #4              // vmask[1] >>= 4
+.ifc \type, y
+        lsr             w2,  w2,  #4              // vmask[2] >>= 4
+.endif
+.ifc \dir, v
+        add             x0,  x0,  #16
+.else
+        // For dir h, x0 is returned incremented
+.endif
+        cbnz            w6,  1b
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+        br              x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
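+// Per-unit bit masks {1, 2, 4, 8}: tested (cmtst) against the broadcast
+// vmask words to select which of the four 4-pixel units in a stripe get
+// each filter width.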
+const word_1248
+        .word 1, 2, 4, 8
+endconst
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S
new file mode 100644 (file)
index 0000000..a731918
--- /dev/null
@@ -0,0 +1,907 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
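+// High bit-depth version of the loop filter in loopfilter.S: the same
+// structure, but with 8 pixels per vector (.8h lanes) instead of 16,
+// clipping against the runtime bitdepth_max (w8), and with the flatness
+// threshold scaled as F = 1 << bitdepth_min_8 (w9).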
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+        uabd            v0.8h,   v22.8h,  v23.8h  // abs(p1 - p0)
+        uabd            v1.8h,   v25.8h,  v24.8h  // abs(q1 - q0)
+        uabd            v2.8h,   v23.8h,  v24.8h  // abs(p0 - q0)
+        uabd            v3.8h,   v22.8h,  v25.8h  // abs(p1 - q1)
+.if \wd >= 6
+        uabd            v4.8h,   v21.8h,  v22.8h  // abs(p2 - p1)
+        uabd            v5.8h,   v26.8h,  v25.8h  // abs(q2 - q1)
+.endif
+.if \wd >= 8
+        uabd            v6.8h,   v20.8h,  v21.8h  // abs(p3 - p2)
+        uabd            v7.8h,   v27.8h,  v26.8h  // abs(q3 - q2)
+.endif
+.if \wd >= 6
+        umax            v4.8h,   v4.8h,   v5.8h
+.endif
+        uqadd           v2.8h,   v2.8h,   v2.8h   // abs(p0 - q0) * 2
+.if \wd >= 8
+        umax            v6.8h,   v6.8h,   v7.8h
+.endif
+        ushr            v3.8h,   v3.8h,   #1
+.if \wd >= 8
+        umax            v4.8h,   v4.8h,   v6.8h
+.endif
+.if \wd >= 6
+        and             v4.16b,  v4.16b,  v14.16b
+.endif
+        umax            v0.8h,   v0.8h,   v1.8h   // max(abs(p1 - p0), abs(q1 - q0))
+        uqadd           v2.8h,   v2.8h,   v3.8h   // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+        umax            v4.8h,   v0.8h,   v4.8h
+        cmhs            v1.8h,   v11.8h,  v4.8h   // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+        cmhs            v1.8h,   v11.8h,  v0.8h   // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+        cmhs            v2.8h,   v10.8h,  v2.8h   // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+        and             v1.16b,  v1.16b,  v2.16b  // fm
+        and             v1.16b,  v1.16b,  v13.16b // fm && wd >= 4
+.if \wd >= 6
+        and             v14.16b, v14.16b, v1.16b  // fm && wd > 4
+.endif
+.if \wd >= 16
+        and             v15.16b, v15.16b, v1.16b  // fm && wd == 16
+.endif
+
+        mov             x16, v1.d[0]
+        mov             x17, v1.d[1]
+        adds            x16, x16, x17
+        b.eq            9f                        // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+        movi            v10.8h,  #1
+        uabd            v2.8h,   v21.8h,  v23.8h  // abs(p2 - p0)
+        uabd            v3.8h,   v22.8h,  v23.8h  // abs(p1 - p0)
+        uabd            v4.8h,   v25.8h,  v24.8h  // abs(q1 - q0)
+        uabd            v5.8h,   v26.8h,  v24.8h  // abs(q2 - q0)
+        dup             v9.8h,   w9               // bitdepth_min_8
+.if \wd >= 8
+        uabd            v6.8h,   v20.8h,  v23.8h  // abs(p3 - p0)
+        uabd            v7.8h,   v27.8h,  v24.8h  // abs(q3 - q0)
+.endif
+        umax            v2.8h,   v2.8h,   v3.8h
+        umax            v4.8h,   v4.8h,   v5.8h
+.if \wd >= 8
+        umax            v6.8h,   v6.8h,   v7.8h
+.endif
+        umax            v2.8h,   v2.8h,   v4.8h
+        ushl            v10.8h,  v10.8h,  v9.8h   // F = 1 << bitdepth_min_8
+.if \wd >= 8
+        umax            v2.8h,   v2.8h,   v6.8h
+.endif
+
+.if \wd == 16
+        uabd            v3.8h,   v17.8h,  v23.8h  // abs(p6 - p0)
+        uabd            v4.8h,   v18.8h,  v23.8h  // abs(p5 - p0)
+        uabd            v5.8h,   v19.8h,  v23.8h  // abs(p4 - p0)
+.endif
+        cmhs            v2.8h,   v10.8h,  v2.8h   // flat8in
+.if \wd == 16
+        uabd            v6.8h,   v28.8h,  v24.8h  // abs(q4 - q0)
+        uabd            v7.8h,   v29.8h,  v24.8h  // abs(q5 - q0)
+        uabd            v8.8h,   v30.8h,  v24.8h  // abs(q6 - q0)
+.endif
+        and             v14.16b, v2.16b,  v14.16b // flat8in && fm && wd > 4
+        bic             v1.16b,  v1.16b,  v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+        umax            v3.8h,   v3.8h,   v4.8h
+        umax            v5.8h,   v5.8h,   v6.8h
+.endif
+        mov             x16, v1.d[0]
+        mov             x17, v1.d[1]
+.if \wd == 16
+        umax            v7.8h,   v7.8h,   v8.8h
+        umax            v3.8h,   v3.8h,   v5.8h
+        umax            v3.8h,   v3.8h,   v7.8h
+        cmhs            v3.8h,   v10.8h,  v3.8h   // flat8out
+.endif
+        adds            x16, x16, x17
+.if \wd == 16
+        and             v15.16b, v15.16b, v3.16b  // flat8out && fm && wd == 16
+        and             v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+        bic             v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+        b.eq            1f                        // skip wd == 4 case
+.endif
+
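+        // Narrow filter at high bit depth: iclip_diff clamps to
+        // +/- (128 << bitdepth_min_8) with smin/smax instead of the 8-bit
+        // saturating narrows used in the 8bpc version.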
+        dup             v3.8h,   w8               // bitdepth_max
+        sub             v2.8h,   v22.8h,  v25.8h  // p1 - q1
+        ushr            v3.8h,   v3.8h,   #1      // (128 << bitdepth_min_8) - 1
+        cmhi            v0.8h,   v0.8h,   v12.8h  // hev
+        not             v9.16b,  v3.16b           // - 128 * (1 << bitdepth_min_8)
+        smin            v2.8h,   v2.8h,   v3.8h   // iclip_diff(p1 - q1)
+        smax            v2.8h,   v2.8h,   v9.8h   // iclip_diff(p1 - q1)
+        and             v4.16b,  v2.16b,  v0.16b  // if (hev) iclip_diff(p1 - q1)
+        sub             v2.8h,   v24.8h,  v23.8h
+        movi            v5.8h,   #3
+        bic             v0.16b,  v1.16b,  v0.16b  // (fm && wd >= 4 && !hev)
+        mul             v2.8h,   v2.8h,   v5.8h
+        movi            v6.8h,   #4
+        add             v2.8h,   v2.8h,   v4.8h
+        smin            v2.8h,   v2.8h,   v3.8h   // f = iclip_diff()
+        movi            v7.8h,   #3
+        smax            v2.8h,   v2.8h,   v9.8h   // f = iclip_diff()
+        sqadd           v4.8h,   v6.8h,   v2.8h   // f + 4
+        sqadd           v5.8h,   v7.8h,   v2.8h   // f + 3
+        smin            v4.8h,   v4.8h,   v3.8h   // imin(f + 4, (128 << bitdepth_min_8) - 1)
+        smin            v5.8h,   v5.8h,   v3.8h   // imin(f + 3, (128 << bitdepth_min_8) - 1)
+        sshr            v4.8h,   v4.8h,   #3      // f1
+        sshr            v5.8h,   v5.8h,   #3      // f2
+        movi            v9.8h,   #0
+        dup             v3.8h,   w8               // bitdepth_max
+        sqadd           v2.8h,   v23.8h,  v5.8h   // p0 + f2
+        sqsub           v6.8h,   v24.8h,  v4.8h   // q0 - f1
+        srshr           v4.8h,   v4.8h,   #1      // (f1 + 1) >> 1
+        smin            v2.8h,   v2.8h,   v3.8h   // out p0 = iclip_pixel()
+        smin            v6.8h,   v6.8h,   v3.8h   // out q0 = iclip_pixel()
+        smax            v2.8h,   v2.8h,   v9.8h   // out p0 = iclip_pixel()
+        smax            v6.8h,   v6.8h,   v9.8h   // out q0 = iclip_pixel()
+        bit             v23.16b, v2.16b,  v1.16b  // if (fm && wd >= 4)
+        bit             v24.16b, v6.16b,  v1.16b  // if (fm && wd >= 4)
+        sqadd           v2.8h,   v22.8h,  v4.8h   // p1 + f
+        sqsub           v6.8h,   v25.8h,  v4.8h   // q1 - f
+        smin            v2.8h,   v2.8h,   v3.8h   // out p1 = iclip_pixel()
+        smin            v6.8h,   v6.8h,   v3.8h   // out q1 = iclip_pixel()
+        smax            v2.8h,   v2.8h,   v9.8h   // out p1 = iclip_pixel()
+        smax            v6.8h,   v6.8h,   v9.8h   // out q1 = iclip_pixel()
+        bit             v22.16b, v2.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
+        bit             v25.16b, v6.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+        b.eq            2f                        // skip if there's no flat8in
+
+        add             v0.8h,   v21.8h,  v21.8h  // p2 * 2
+        add             v2.8h,   v21.8h,  v22.8h  // p2 + p1
+        add             v4.8h,   v22.8h,  v23.8h  // p1 + p0
+        add             v6.8h,   v23.8h,  v24.8h  // p0 + q0
+        add             v8.8h,   v0.8h,   v2.8h
+        add             v10.8h,  v4.8h,   v6.8h
+        add             v12.8h,  v24.8h,  v25.8h  // q0 + q1
+        add             v8.8h,   v8.8h,   v10.8h
+        sub             v12.8h,  v12.8h,  v0.8h
+        add             v10.8h,  v25.8h,  v26.8h  // q1 + q2
+        urshr           v0.8h,   v8.8h,   #3      // out p1
+
+        add             v8.8h,   v8.8h,   v12.8h
+        sub             v10.8h,  v10.8h,  v2.8h
+        add             v12.8h,  v26.8h,  v26.8h  // q2 + q2
+        urshr           v1.8h,   v8.8h,   #3      // out p0
+
+        add             v8.8h,   v8.8h,   v10.8h
+        sub             v12.8h,  v12.8h,  v4.8h
+        urshr           v2.8h,   v8.8h,   #3      // out q0
+
+        bit             v22.16b, v0.16b,  v14.16b // p1 if (flat8in)
+        add             v8.8h,   v8.8h,   v12.8h
+        bit             v23.16b, v1.16b,  v14.16b // p0 if (flat8in)
+        urshr           v3.8h,   v8.8h,   #3      // out q1
+        bit             v24.16b, v2.16b,  v14.16b // q0 if (flat8in)
+        bit             v25.16b, v3.16b,  v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+.if \wd == 8
+        b.eq            8f                        // skip if there's no flat8in
+.else
+        b.eq            2f                        // skip if there's no flat8in
+.endif
+
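+        // Flat (wd >= 8) filter: 8-tap weighted averages over p3..q3, also
+        // kept as a running sum. Roughly, for the first output:
+        //   p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;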
+        add             v0.8h,   v20.8h,  v21.8h  // p3 + p2
+        add             v2.8h,   v22.8h,  v25.8h  // p1 + q1
+        add             v4.8h,   v20.8h,  v22.8h  // p3 + p1
+        add             v6.8h,   v23.8h,  v26.8h  // p0 + q2
+        add             v8.8h,   v0.8h,   v0.8h   // 2 * (p3 + p2)
+        add             v9.8h,   v23.8h,  v24.8h  // p0 + q0
+        add             v8.8h,   v8.8h,   v4.8h   // + p3 + p1
+        sub             v2.8h,   v2.8h,   v0.8h   // p1 + q1 - p3 - p2
+        add             v8.8h,   v8.8h,   v9.8h   // + p0 + q0
+        sub             v6.8h,   v6.8h,   v4.8h   // p0 + q2 - p3 - p1
+        urshr           v10.8h,  v8.8h,   #3      // out p2
+
+        add             v8.8h,   v8.8h,   v2.8h
+        add             v0.8h,   v20.8h,  v23.8h  // p3 + p0
+        add             v2.8h,   v24.8h,  v27.8h  // q0 + q3
+        urshr           v11.8h,  v8.8h,   #3      // out p1
+
+        add             v8.8h,   v8.8h,   v6.8h
+        sub             v2.8h,   v2.8h,   v0.8h   // q0 + q3 - p3 - p0
+        add             v4.8h,   v21.8h,  v24.8h  // p2 + q0
+        add             v6.8h,   v25.8h,  v27.8h  // q1 + q3
+        urshr           v12.8h,  v8.8h,   #3      // out p0
+
+        add             v8.8h,   v8.8h,   v2.8h
+        sub             v6.8h,   v6.8h,   v4.8h   // q1 + q3 - p2 - q0
+        add             v0.8h,   v22.8h,  v25.8h  // p1 + q1
+        add             v2.8h,   v26.8h,  v27.8h  // q2 + q3
+        urshr           v13.8h,  v8.8h,   #3      // out q0
+
+        add             v8.8h,   v8.8h,   v6.8h
+        sub             v2.8h,   v2.8h,   v0.8h   // q2 + q3 - p1 - q1
+        urshr           v0.8h,   v8.8h,   #3      // out q1
+
+        add             v8.8h,   v8.8h,   v2.8h
+
+        bit             v21.16b, v10.16b, v14.16b
+        bit             v22.16b, v11.16b, v14.16b
+        bit             v23.16b, v12.16b, v14.16b
+        urshr           v1.8h,   v8.8h,   #3      // out q2
+        bit             v24.16b, v13.16b, v14.16b
+        bit             v25.16b, v0.16b,  v14.16b
+        bit             v26.16b, v1.16b,  v14.16b
+.endif
+2:
+.if \wd == 16
+        mov             x16, v15.d[0]
+        mov             x17, v15.d[1]
+        adds            x16, x16, x17
+        b.ne            1f                        // check if flat8out is needed
+        mov             x16, v14.d[0]
+        mov             x17, v14.d[1]
+        adds            x16, x16, x17
+        b.eq            8f                        // if there was no flat8in, just write the inner 4 pixels
+        b               7f                        // if flat8in was used, write the inner 6 pixels
+1:
+
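+        // Wide (wd == 16) filter: 16-tap weighted averages over p6..q6 as a
+        // running sum. Roughly, for the first output:
+        //   p5' = (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+        // each later output subtracts one pair and adds the next.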
+        add             v2.8h,   v17.8h,  v17.8h  // p6 + p6
+        add             v4.8h,   v17.8h,  v18.8h  // p6 + p5
+        add             v6.8h,   v17.8h,  v19.8h  // p6 + p4
+        add             v8.8h,   v17.8h,  v20.8h  // p6 + p3
+        add             v12.8h,  v2.8h,   v4.8h
+        add             v10.8h,  v6.8h,   v8.8h
+        add             v6.8h,   v17.8h,  v21.8h  // p6 + p2
+        add             v12.8h,  v12.8h,  v10.8h
+        add             v8.8h,   v17.8h,  v22.8h  // p6 + p1
+        add             v10.8h,  v18.8h,  v23.8h  // p5 + p0
+        add             v6.8h,   v6.8h,   v8.8h
+        add             v8.8h,   v19.8h,  v24.8h  // p4 + q0
+        add             v12.8h,  v12.8h,  v6.8h
+        add             v10.8h,  v10.8h,  v8.8h
+        add             v6.8h,   v20.8h,  v25.8h  // p3 + q1
+        add             v12.8h,  v12.8h,  v10.8h
+        sub             v6.8h,   v6.8h,   v2.8h
+        add             v2.8h,   v21.8h,  v26.8h  // p2 + q2
+        urshr           v0.8h,   v12.8h,  #4      // out p5
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p6) + (p3 + q1)
+        sub             v2.8h,   v2.8h,   v4.8h
+        add             v4.8h,   v22.8h,  v27.8h  // p1 + q3
+        add             v6.8h,   v17.8h,  v19.8h  // p6 + p4
+        urshr           v1.8h,   v12.8h,  #4      // out p4
+        add             v12.8h,  v12.8h,  v2.8h   // - (p6 + p5) + (p2 + q2)
+        sub             v4.8h,   v4.8h,   v6.8h
+        add             v6.8h,   v23.8h,  v28.8h  // p0 + q4
+        add             v8.8h,   v17.8h,  v20.8h  // p6 + p3
+        urshr           v2.8h,   v12.8h,  #4      // out p3
+        add             v12.8h,  v12.8h,  v4.8h   // - (p6 + p4) + (p1 + q3)
+        sub             v6.8h,   v6.8h,   v8.8h
+        add             v8.8h,   v24.8h,  v29.8h  // q0 + q5
+        add             v4.8h,   v17.8h,  v21.8h  // p6 + p2
+        urshr           v3.8h,   v12.8h,  #4      // out p2
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p3) + (p0 + q4)
+        sub             v8.8h,   v8.8h,   v4.8h
+        add             v6.8h,   v25.8h,  v30.8h  // q1 + q6
+        add             v10.8h,  v17.8h,  v22.8h  // p6 + p1
+        urshr           v4.8h,   v12.8h,  #4      // out p1
+        add             v12.8h,  v12.8h,  v8.8h   // - (p6 + p2) + (q0 + q5)
+        sub             v6.8h,   v6.8h,   v10.8h
+        add             v8.8h,   v26.8h,  v30.8h  // q2 + q6
+        bif             v0.16b,  v18.16b, v15.16b // out p5
+        add             v10.8h,  v18.8h,  v23.8h  // p5 + p0
+        urshr           v5.8h,   v12.8h,  #4      // out p0
+        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p1) + (q1 + q6)
+        sub             v8.8h,   v8.8h,   v10.8h
+        add             v10.8h,  v27.8h,  v30.8h  // q3 + q6
+        bif             v1.16b,  v19.16b, v15.16b // out p4
+        add             v18.8h,  v19.8h,  v24.8h  // p4 + q0
+        urshr           v6.8h,   v12.8h,  #4      // out q0
+        add             v12.8h,  v12.8h,  v8.8h   // - (p5 + p0) + (q2 + q6)
+        sub             v10.8h,  v10.8h,  v18.8h
+        add             v8.8h,   v28.8h,  v30.8h  // q4 + q6
+        bif             v2.16b,  v20.16b, v15.16b // out p3
+        add             v18.8h,  v20.8h,  v25.8h  // p3 + q1
+        urshr           v7.8h,   v12.8h,  #4      // out q1
+        add             v12.8h,  v12.8h,  v10.8h  // - (p4 + q0) + (q3 + q6)
+        sub             v18.8h,  v8.8h,   v18.8h
+        add             v10.8h,  v29.8h,  v30.8h  // q5 + q6
+        bif             v3.16b,  v21.16b, v15.16b // out p2
+        add             v20.8h,  v21.8h,  v26.8h  // p2 + q2
+        urshr           v8.8h,   v12.8h,  #4      // out q2
+        add             v12.8h,  v12.8h,  v18.8h  // - (p3 + q1) + (q4 + q6)
+        sub             v10.8h,  v10.8h,  v20.8h
+        add             v18.8h,  v30.8h,  v30.8h  // q6 + q6
+        bif             v4.16b,  v22.16b, v15.16b // out p1
+        add             v20.8h,  v22.8h,  v27.8h  // p1 + q3
+        urshr           v9.8h,   v12.8h,  #4      // out q3
+        add             v12.8h,  v12.8h,  v10.8h  // - (p2 + q2) + (q5 + q6)
+        sub             v18.8h,  v18.8h,  v20.8h
+        bif             v5.16b,  v23.16b, v15.16b // out p0
+        urshr           v10.8h,  v12.8h,  #4      // out q4
+        add             v12.8h,  v12.8h,  v18.8h  // - (p1 + q3) + (q6 + q6)
+        urshr           v11.8h,  v12.8h,  #4      // out q5
+        bif             v6.16b,  v24.16b, v15.16b // out q0
+        bif             v7.16b,  v25.16b, v15.16b // out q1
+        bif             v8.16b,  v26.16b, v15.16b // out q2
+        bif             v9.16b,  v27.16b, v15.16b // out q3
+        bif             v10.16b, v28.16b, v15.16b // out q4
+        bif             v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+        ret
+.if \wd == 16
+7:
+        // Return to a shorter epilogue, writing only the inner 6 pixels
+        br              x13
+.endif
+.if \wd >= 8
+8:
+        // Return to a shorter epilogue, writing only the inner 4 pixels
+        br              x14
+.endif
+9:
+        // Return directly without writing back any pixels
+        br              x15
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
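+// The lpf_8_wd* helpers below don't use the regular return convention:
+// x15 holds the return address for "no pixels written", x14 the address of
+// a 4-pixel epilogue, and x13 (wd == 16 only) that of a 6-pixel epilogue,
+// so lpf_8_wd16_neon can fall back to the narrower writeback paths.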
+.macro lpf_8_wd16
+        adr             x13, 7f
+        adr             x14, 8f
+        bl              lpf_8_wd16_neon
+.endm
+
+.macro lpf_8_wd8
+        adr             x14, 8f
+        bl              lpf_8_wd8_neon
+.endm
+
+.macro lpf_8_wd6
+        bl              lpf_8_wd6_neon
+.endm
+
+.macro lpf_8_wd4
+        bl              lpf_8_wd4_neon
+.endm
+
+function lpf_v_4_8_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #1
+        ld1             {v22.8h}, [x16], x1 // p1
+        ld1             {v24.8h}, [x0],  x1 // q0
+        ld1             {v23.8h}, [x16], x1 // p0
+        ld1             {v25.8h}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+
+        lpf_8_wd4
+
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v25.8h}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_4_8_neon
+        mov             x15, x30
+        sub             x16, x0,  #4
+        add             x0,  x16, x1, lsl #2
+        ld1             {v22.d}[0], [x16], x1
+        ld1             {v22.d}[1], [x0],  x1
+        ld1             {v23.d}[0], [x16], x1
+        ld1             {v23.d}[1], [x0],  x1
+        ld1             {v24.d}[0], [x16], x1
+        ld1             {v24.d}[1], [x0],  x1
+        ld1             {v25.d}[0], [x16], x1
+        ld1             {v25.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+
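+        // The filter works on rows, so for the horizontal (h) variant the
+        // loaded block of columns is transposed into rows here, and
+        // transposed back before storing.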
+        transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_8_wd4
+
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #4
+        transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+endfunc
+
+function lpf_v_6_8_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #1
+        sub             x16, x16, x1
+        ld1             {v21.8h}, [x16], x1 // p2
+        ld1             {v24.8h}, [x0],  x1 // q0
+        ld1             {v22.8h}, [x16], x1 // p1
+        ld1             {v25.8h}, [x0],  x1 // q1
+        ld1             {v23.8h}, [x16], x1 // p0
+        ld1             {v26.8h}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+
+        lpf_8_wd6
+
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v25.8h}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_6_8_neon
+        mov             x15, x30
+        sub             x16, x0,  #8
+        add             x0,  x16, x1, lsl #2
+        ld1             {v20.8h}, [x16], x1
+        ld1             {v24.8h}, [x0],  x1
+        ld1             {v21.8h}, [x16], x1
+        ld1             {v25.8h}, [x0],  x1
+        ld1             {v22.8h}, [x16], x1
+        ld1             {v26.8h}, [x0],  x1
+        ld1             {v23.8h}, [x16], x1
+        ld1             {v27.8h}, [x0],  x1
+        add             x0,  x0,  #8
+
+        transpose_8x8h  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_8_wd6
+
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #4
+        transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+endfunc
+
+function lpf_v_8_8_neon
+        mov             x15, x30
+        sub             x16, x0,  x1, lsl #2
+        ld1             {v20.8h}, [x16], x1 // p3
+        ld1             {v24.8h}, [x0],  x1 // q0
+        ld1             {v21.8h}, [x16], x1 // p2
+        ld1             {v25.8h}, [x0],  x1 // q1
+        ld1             {v22.8h}, [x16], x1 // p1
+        ld1             {v26.8h}, [x0],  x1 // q2
+        ld1             {v23.8h}, [x16], x1 // p0
+        ld1             {v27.8h}, [x0],  x1 // q3
+        sub             x0,  x0,  x1, lsl #2
+
+        lpf_8_wd8
+
+        sub             x16, x0,  x1, lsl #1
+        sub             x16, x16,  x1
+        st1             {v21.8h}, [x16], x1 // p2
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v25.8h}, [x0],  x1 // q1
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v26.8h}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x15
+
+8:
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v25.8h}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_8_8_neon
+        mov             x15, x30
+        sub             x16, x0,  #8
+        add             x0,  x16, x1, lsl #2
+        ld1             {v20.8h}, [x16], x1
+        ld1             {v24.8h}, [x0],  x1
+        ld1             {v21.8h}, [x16], x1
+        ld1             {v25.8h}, [x0],  x1
+        ld1             {v22.8h}, [x16], x1
+        ld1             {v26.8h}, [x0],  x1
+        ld1             {v23.8h}, [x16], x1
+        ld1             {v27.8h}, [x0],  x1
+        add             x0,  x0,  #8
+
+        transpose_8x8h  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        lpf_8_wd8
+
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #8
+        transpose_8x8h  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v20.8h}, [x16], x1
+        st1             {v24.8h}, [x0],  x1
+        st1             {v21.8h}, [x16], x1
+        st1             {v25.8h}, [x0],  x1
+        st1             {v22.8h}, [x16], x1
+        st1             {v26.8h}, [x0],  x1
+        st1             {v23.8h}, [x16], x1
+        st1             {v27.8h}, [x0],  x1
+        add             x0,  x0,  #8
+        br              x15
+8:
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #4
+        transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+endfunc
+
+function lpf_v_16_8_neon
+        mov             x15, x30
+
+        sub             x16, x0,  x1, lsl #3
+        add             x16, x16, x1
+        ld1             {v17.8h}, [x16], x1 // p6
+        ld1             {v24.8h}, [x0],  x1 // q0
+        ld1             {v18.8h}, [x16], x1 // p5
+        ld1             {v25.8h}, [x0],  x1 // q1
+        ld1             {v19.8h}, [x16], x1 // p4
+        ld1             {v26.8h}, [x0],  x1 // q2
+        ld1             {v20.8h}, [x16], x1 // p3
+        ld1             {v27.8h}, [x0],  x1 // q3
+        ld1             {v21.8h}, [x16], x1 // p2
+        ld1             {v28.8h}, [x0],  x1 // q4
+        ld1             {v22.8h}, [x16], x1 // p1
+        ld1             {v29.8h}, [x0],  x1 // q5
+        ld1             {v23.8h}, [x16], x1 // p0
+        ld1             {v30.8h}, [x0],  x1 // q6
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  x1
+
+        lpf_8_wd16
+
+        sub             x16, x0,  x1, lsl #2
+        sub             x16, x16, x1, lsl #1
+        st1             {v0.8h},  [x16], x1 // p5
+        st1             {v6.8h},  [x0],  x1 // q0
+        st1             {v1.8h},  [x16], x1 // p4
+        st1             {v7.8h},  [x0],  x1 // q1
+        st1             {v2.8h},  [x16], x1 // p3
+        st1             {v8.8h},  [x0],  x1 // q2
+        st1             {v3.8h},  [x16], x1 // p2
+        st1             {v9.8h},  [x0],  x1 // q3
+        st1             {v4.8h},  [x16], x1 // p1
+        st1             {v10.8h}, [x0],  x1 // q4
+        st1             {v5.8h},  [x16], x1 // p0
+        st1             {v11.8h}, [x0],  x1 // q5
+        sub             x0,  x0,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+7:
+        sub             x16, x0,  x1
+        sub             x16, x16, x1, lsl #1
+        st1             {v21.8h}, [x16], x1 // p2
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v25.8h}, [x0],  x1 // q1
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v26.8h}, [x0],  x1 // q2
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x15
+
+8:
+        sub             x16, x0,  x1, lsl #1
+        st1             {v22.8h}, [x16], x1 // p1
+        st1             {v24.8h}, [x0],  x1 // q0
+        st1             {v23.8h}, [x16], x1 // p0
+        st1             {v25.8h}, [x0],  x1 // q1
+        sub             x0,  x0,  x1, lsl #1
+        br              x15
+endfunc
+
+function lpf_h_16_8_neon
+        mov             x15, x30
+        sub             x16, x0,  #16
+        ld1             {v16.8h}, [x16], x1
+        ld1             {v24.8h}, [x0],  x1
+        ld1             {v17.8h}, [x16], x1
+        ld1             {v25.8h}, [x0],  x1
+        ld1             {v18.8h}, [x16], x1
+        ld1             {v26.8h}, [x0],  x1
+        ld1             {v19.8h}, [x16], x1
+        ld1             {v27.8h}, [x0],  x1
+        ld1             {v20.8h}, [x16], x1
+        ld1             {v28.8h}, [x0],  x1
+        ld1             {v21.8h}, [x16], x1
+        ld1             {v29.8h}, [x0],  x1
+        ld1             {v22.8h}, [x16], x1
+        ld1             {v30.8h}, [x0],  x1
+        ld1             {v23.8h}, [x16], x1
+        ld1             {v31.8h}, [x0],  x1
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1
+
+        lpf_8_wd16
+
+        sub             x0,  x0,  x1, lsl #3
+        sub             x16, x0,  #16
+
+        transpose_8x8h  v16, v17, v0,  v1,  v2,  v3,  v4,  v5,  v18, v19
+        transpose_8x8h  v6,  v7,  v8,  v9,  v10, v11, v30, v31, v18, v19
+
+        st1             {v16.8h}, [x16], x1
+        st1             {v6.8h},  [x0],  x1
+        st1             {v17.8h}, [x16], x1
+        st1             {v7.8h},  [x0],  x1
+        st1             {v0.8h},  [x16], x1
+        st1             {v8.8h},  [x0],  x1
+        st1             {v1.8h},  [x16], x1
+        st1             {v9.8h},  [x0],  x1
+        st1             {v2.8h},  [x16], x1
+        st1             {v10.8h}, [x0],  x1
+        st1             {v3.8h},  [x16], x1
+        st1             {v11.8h}, [x0],  x1
+        st1             {v4.8h},  [x16], x1
+        st1             {v30.8h}, [x0],  x1
+        st1             {v5.8h},  [x16], x1
+        st1             {v31.8h}, [x0],  x1
+        br              x15
+
+7:
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #8
+        transpose_8x8h  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v20.8h}, [x16], x1
+        st1             {v24.8h}, [x0],  x1
+        st1             {v21.8h}, [x16], x1
+        st1             {v25.8h}, [x0],  x1
+        st1             {v22.8h}, [x16], x1
+        st1             {v26.8h}, [x0],  x1
+        st1             {v23.8h}, [x16], x1
+        st1             {v27.8h}, [x0],  x1
+        add             x0,  x0,  #8
+        br              x15
+8:
+        sub             x16, x0,  x1, lsl #3
+        sub             x16, x16, #4
+        transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
+        add             x0,  x16, x1, lsl #2
+
+        st1             {v22.d}[0], [x16], x1
+        st1             {v22.d}[1], [x0],  x1
+        st1             {v23.d}[0], [x16], x1
+        st1             {v23.d}[1], [x0],  x1
+        st1             {v24.d}[0], [x16], x1
+        st1             {v24.d}[1], [x0],  x1
+        st1             {v25.d}[0], [x16], x1
+        st1             {v25.d}[1], [x0],  x1
+        add             x0,  x0,  #4
+        br              x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                  const uint32_t *const vmask,
+//                                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
+//                                  const Av1FilterLUT *lut, const int w,
+//                                  const int bitdepth_max)
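+//
+// A rough C sketch of how the per-edge parameters are computed below
+// (mirroring the masks and thresholds the asm builds; not the exact
+// reference code):
+//   L = l[0][0] ? l[0][0] : l[offset][0];
+//   if (!L) continue;
+//   H = L >> 4;                                  // hev threshold
+//   I = imax(imin(L >> sharp[0], sharp[1]), 1);  // limit
+//   E = 2 * (L + 2) + I;                         // edge threshold
+//   H <<= bitdepth_min_8; I <<= bitdepth_min_8; E <<= bitdepth_min_8;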
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+        mov             x11, x30
+        mov             w8,  w7  // bitdepth_max
+        clz             w9,  w8
+        mov             w10, #24
+        sub             w9,  w10,  w9 // bitdepth_min_8
+        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+        ldp             w6,  w7,  [x2]           // vmask[0], vmask[1]
+.ifc \type, y
+        ldr             w2,  [x2, #8]            // vmask[2]
+.endif
+        add             x5,  x5,  #128           // Move to sharp part of lut
+.ifc \type, y
+        orr             w7,  w7,  w2             // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+        sub             x4,  x3,  x4, lsl #2
+.else
+        sub             x3,  x3,  #4
+        lsl             x4,  x4,  #2
+.endif
+        orr             w6,  w6,  w7             // vmask[0] |= vmask[1]
+
+1:
+        tst             w6,  #0x0f
+.ifc \dir, v
+        ld1             {v0.8b}, [x4], #8
+        ld1             {v1.8b}, [x3], #8
+.else
+        ld2             {v0.s,v1.s}[0], [x3], x4
+        ld2             {v0.s,v1.s}[1], [x3], x4
+.endif
+        b.eq            7f                        // if (!(vm & bits)) continue;
+
+        ld1r            {v5.8b}, [x5]             // sharp[0]
+        add             x5,  x5,  #8
+        movi            v2.2s,   #0xff
+        dup             v13.2s,  w6               // vmask[0]
+        dup             v31.8h,  w9               // bitdepth_min_8
+
+        and             v0.8b,   v0.8b,   v2.8b   // Keep only lowest byte in each 32 bit word
+        and             v1.8b,   v1.8b,   v2.8b
+        cmtst           v3.8b,   v1.8b,   v2.8b   // Check for nonzero values in l[0][0]
+        movi            v4.8b,   #1
+        ld1r            {v6.8b}, [x5]             // sharp[1]
+        sub             x5,  x5,  #8
+        bif             v1.8b,   v0.8b,   v3.8b   // if (!l[0][0]) L = l[offset][0]
+        mul             v1.2s,   v1.2s,   v4.2s   // L
+.ifc \type, y
+        dup             v15.2s,  w2               // vmask[2]
+.endif
+        cmtst           v2.2s,   v1.2s,   v2.2s   // L != 0
+        dup             v14.2s,  w7               // vmask[1]
+        mov             x16, v2.d[0]
+        cmp             x16, #0
+        b.eq            7f                        // if (!L) continue;
+        neg             v5.8b,   v5.8b            // -sharp[0]
+        movrel          x16,  word_12
+        ushr            v12.8b,  v1.8b,   #4      // H
+        ld1             {v16.2s}, [x16]
+        sshl            v3.8b,   v1.8b,   v5.8b   // L >> sharp[0]
+.ifc \type, y
+        cmtst           v15.2s,  v15.2s,  v16.2s  // if (vmask[2] & bits)
+.endif
+        movi            v7.8b,   #2
+        umin            v3.8b,   v3.8b,   v6.8b   // imin(L >> sharp[0], sharp[1])
+        add             v0.8b,   v1.8b,   v7.8b   // L + 2
+        umax            v11.8b,  v3.8b,   v4.8b   // imax(imin(), 1) = limit = I
+        add             v0.8b,   v0.8b,   v0.8b   // 2*(L + 2)
+        cmtst           v14.2s,  v14.2s,  v16.2s  // if (vmask[1] & bits)
+        uxtl            v12.8h,  v12.8b
+        add             v10.8b,  v0.8b,   v11.8b  // 2*(L + 2) + limit = E
+        cmtst           v13.2s,  v13.2s,  v16.2s  // if (vmask[0] & bits)
+        uxtl            v11.8h,  v11.8b
+        uxtl            v10.8h,  v10.8b
+        and             v13.8b,  v13.8b,  v2.8b   // vmask[0] &= L != 0
+        sxtl            v14.8h,  v14.8b
+        sxtl            v13.8h,  v13.8b
+.ifc \type, y
+        sxtl            v15.8h,  v15.8b
+.endif
+        ushl            v12.8h,  v12.8h,  v31.8h
+        ushl            v11.8h,  v11.8h,  v31.8h
+        ushl            v10.8h,  v10.8h,  v31.8h
+
+.ifc \type, y
+        tst             w2,  #0x0f
+        b.eq            2f
+        // wd16
+        bl              lpf_\dir\()_16_8_neon
+        b               8f
+2:
+.endif
+        tst             w7,  #0x0f
+        b.eq            3f
+.ifc \type, y
+        // wd8
+        bl              lpf_\dir\()_8_8_neon
+.else
+        // wd6
+        bl              lpf_\dir\()_6_8_neon
+.endif
+        b               8f
+3:
+        // wd4
+        bl              lpf_\dir\()_4_8_neon
+.ifc \dir, h
+        b               8f
+7:
+        // For dir h, the functions above increment x0.
+        // If the whole function is skipped, increment it here instead.
+        add             x0,  x0,  x1,  lsl #3
+.else
+7:
+.endif
+8:
+        lsr             w6,  w6,  #2              // vmask[0] >>= 2
+        lsr             w7,  w7,  #2              // vmask[1] >>= 2
+.ifc \type, y
+        lsr             w2,  w2,  #2              // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+        add             x0,  x0,  #16
+.else
+        // For dir h, x0 is returned incremented
+.endif
+        cbnz            w6,  1b
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+        br              x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_12
+        .word 1, 2
+endconst
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S
new file mode 100644 (file)
index 0000000..1e864c2
--- /dev/null
@@ -0,0 +1,1152 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                      const pixel *src, ptrdiff_t stride,
+//                                      const int16_t fh[7], const intptr_t w,
+//                                      int h, enum LrEdgeFlags edges);
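+//
+// A rough C sketch of the per-pixel math in the filter macro below
+// (satadd16() here stands for the saturating 16-bit add the asm uses):
+//   sum = 0;
+//   for (i = 0; i < 7; i++) sum += fh[i] * px[i];
+//   sum = satadd16(sum, 128 * px[3] - ((1 << 14) - (1 << 2)));
+//   mid = (sum >> 3) + (1 << 11);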
+function wiener_filter_h_8bpc_neon, export=1
+        mov             w8,  w5
+        ld1             {v0.8h},  [x4]
+        mov             w9,  #(1 << 14) - (1 << 2)
+        dup             v30.8h,  w9
+        movi            v31.8h,  #8, lsl #8
+        // Calculate mid_stride
+        add             w10, w5,  #7
+        bic             w10, w10, #7
+        lsl             w10, w10, #1
+
+        // Clear the last unused element of v0, to allow filtering a single
+        // pixel with one plain mul+addv.
+        ins             v0.h[7], wzr
+
+        // Set up pointers for reading/writing alternate rows
+        add             x12, x0,  x10
+        lsl             w10, w10, #1
+        add             x13, x2,  x3
+        lsl             x3,  x3,  #1
+
+        // Subtract the width from mid_stride
+        sub             x10, x10, w5, uxtw #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+        cmp             w5,  #8
+        add             w11, w5,  #13
+        bic             w11, w11, #7
+        b.ge            1f
+        mov             w11, #16
+1:
+        sub             x3,  x3,  w11, uxtw
+
+        // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x1,  0f
+        // left == NULL
+        sub             x2,  x2,  #3
+        sub             x13, x13, #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             x3,  x3,  #3
+
+1:      // Loop vertically
+        ld1             {v3.16b},  [x2],  #16
+        ld1             {v5.16b},  [x13], #16
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x1,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v2.s}[3],  [x1], #4
+        // Move x2/x13 back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             x2,  x2,  #3
+        sub             x13, x13, #3
+        ld1             {v4.s}[3],  [x1], #4
+        ext             v3.16b, v2.16b, v3.16b, #13
+        ext             v5.16b, v4.16b, v5.16b, #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+        // and shift v3 to have 3x the first byte at the front.
+        dup             v2.16b, v3.b[0]
+        dup             v4.16b, v5.b[0]
+        // Move x2 back to account for the last 3 bytes we loaded before,
+        // which we shifted out.
+        sub             x2,  x2,  #3
+        sub             x13, x13, #3
+        ext             v3.16b, v2.16b, v3.16b, #13
+        ext             v5.16b, v4.16b, v5.16b, #13
+
+2:
+        uxtl            v2.8h,  v3.8b
+        uxtl2           v3.8h,  v3.16b
+        uxtl            v4.8h,  v5.8b
+        uxtl2           v5.8h,  v5.16b
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             w9,  w5, #14
+        ldr             b28, [x2,  w9, sxtw]
+        ldr             b29, [x13, w9, sxtw]
+        // Fill v28/v29 with the right padding pixel
+        dup             v28.8b,  v28.b[0]
+        dup             v29.8b,  v29.b[0]
+        uxtl            v28.8h,  v28.8b
+        uxtl            v29.8h,  v29.8b
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             w5,  #11
+        b.ge            4f   // If w >= 11, all used input pixels are valid
+        cmp             w5,  #7
+        b.ge            5f   // If w >= 7, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro filter wd
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        ext             v16.16b, v2.16b,  v3.16b, #2
+        ext             v17.16b, v2.16b,  v3.16b, #4
+        ext             v18.16b, v2.16b,  v3.16b, #6
+        ext             v19.16b, v2.16b,  v3.16b, #8
+        ext             v20.16b, v2.16b,  v3.16b, #10
+        ext             v21.16b, v2.16b,  v3.16b, #12
+        mul             v6\wd,   v2\wd,   v0.h[0]
+        mla             v6\wd,   v16\wd,  v0.h[1]
+        mla             v6\wd,   v17\wd,  v0.h[2]
+        mla             v6\wd,   v18\wd,  v0.h[3]
+        mla             v6\wd,   v19\wd,  v0.h[4]
+        mla             v6\wd,   v20\wd,  v0.h[5]
+        mla             v6\wd,   v21\wd,  v0.h[6]
+        ext             v22.16b, v4.16b,  v5.16b, #2
+        ext             v23.16b, v4.16b,  v5.16b, #4
+        ext             v24.16b, v4.16b,  v5.16b, #6
+        ext             v25.16b, v4.16b,  v5.16b, #8
+        ext             v26.16b, v4.16b,  v5.16b, #10
+        ext             v27.16b, v4.16b,  v5.16b, #12
+        mul             v7\wd,   v4\wd,   v0.h[0]
+        mla             v7\wd,   v22\wd,  v0.h[1]
+        mla             v7\wd,   v23\wd,  v0.h[2]
+        mla             v7\wd,   v24\wd,  v0.h[3]
+        mla             v7\wd,   v25\wd,  v0.h[4]
+        mla             v7\wd,   v26\wd,  v0.h[5]
+        mla             v7\wd,   v27\wd,  v0.h[6]
+
+        shl             v18\wd,  v18\wd,  #7
+        shl             v24\wd,  v24\wd,  #7
+        sub             v18\wd,  v18\wd,  v30\wd
+        sub             v24\wd,  v24\wd,  v30\wd
+        sqadd           v6\wd,   v6\wd,   v18\wd
+        sqadd           v7\wd,   v7\wd,   v24\wd
+        sshr            v6\wd,   v6\wd,   #3
+        sshr            v7\wd,   v7\wd,   #3
+        add             v6\wd,   v6\wd,   v31\wd
+        add             v7\wd,   v7\wd,   v31\wd
+.endm
+        filter          .8h
+        st1             {v6.8h},  [x0],  #16
+        st1             {v7.8h},  [x12], #16
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        mov             v2.16b,  v3.16b
+        mov             v4.16b,  v5.16b
+        ld1             {v3.8b},  [x2],  #8
+        ld1             {v5.8b},  [x13], #8
+        uxtl            v3.8h,   v3.8b
+        uxtl            v5.8h,   v5.8b
+        b.ne            4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Filter 4 pixels, 7 <= w < 11
+        filter          .4h
+        st1             {v6.4h},  [x0],  #8
+        st1             {v7.4h},  [x12], #8
+
+        subs            w5,  w5,  #4 // 3 <= w < 7
+        ext             v2.16b,  v2.16b,  v3.16b, #8
+        ext             v3.16b,  v3.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v5.16b, #8
+        ext             v5.16b,  v5.16b,  v5.16b, #8
+
+6:      // Pad the right edge and filter the last few pixels.
+        // w < 7, w+3 pixels valid in v2-v3
+        cmp             w5,  #5
+        b.lt            7f
+        b.gt            8f
+        // w == 5, 8 pixels valid in v2, v3 invalid
+        mov             v3.16b,  v28.16b
+        mov             v5.16b,  v29.16b
+        b               88f
+
+7:      // 1 <= w < 5, 4-7 pixels valid in v2
+        sub             w9,  w5,  #1
+        // w9 = (pixels valid - 4)
+        adr             x11, L(variable_shift_tbl)
+        ldrh            w9,  [x11, w9, uxtw #1]
+        sub             x11, x11, w9, uxth
+        mov             v3.16b,  v28.16b
+        mov             v5.16b,  v29.16b
+        br              x11
+44:     // 4 pixels valid in v2/v4, fill the high half with padding.
+        ins             v2.d[1], v3.d[0]
+        ins             v4.d[1], v5.d[0]
+        b               88f
+        // Shift v2 right, shifting out invalid pixels,
+        // shift v2 left to the original offset, shifting in padding pixels.
+55:     // 5 pixels valid
+        ext             v2.16b,  v2.16b,  v2.16b,  #10
+        ext             v2.16b,  v2.16b,  v3.16b,  #6
+        ext             v4.16b,  v4.16b,  v4.16b,  #10
+        ext             v4.16b,  v4.16b,  v5.16b,  #6
+        b               88f
+66:     // 6 pixels valid, fill the upper 2 pixels with padding.
+        ins             v2.s[3], v3.s[0]
+        ins             v4.s[3], v5.s[0]
+        b               88f
+77:     // 7 pixels valid, fill the last pixel with padding.
+        ins             v2.h[7], v3.h[0]
+        ins             v4.h[7], v5.h[0]
+        b               88f
+
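+        // Each entry below is the distance from L(variable_shift_tbl) back
+        // to the handler for that "pixels valid - 4" count; the dispatch
+        // above subtracts the loaded halfword from the table address.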
+L(variable_shift_tbl):
+        .hword L(variable_shift_tbl) - 44b
+        .hword L(variable_shift_tbl) - 55b
+        .hword L(variable_shift_tbl) - 66b
+        .hword L(variable_shift_tbl) - 77b
+
+8:      // w == 6 (the only w > 5 case here); 9 pixels valid in v2-v3, 1 pixel valid in v3
+        ins             v28.h[0],  v3.h[0]
+        ins             v29.h[0],  v5.h[0]
+        mov             v3.16b,  v28.16b
+        mov             v5.16b,  v29.16b
+
+88:
+        // w < 7, v2-v3 padded properly
+        cmp             w5,  #4
+        b.lt            888f
+
+        // w >= 4, filter 4 pixels
+        filter          .4h
+        st1             {v6.4h},  [x0],  #8
+        st1             {v7.4h},  [x12], #8
+        subs            w5,  w5,  #4 // 0 <= w < 4
+        ext             v2.16b,  v2.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v5.16b, #8
+        b.eq            9f
+888:    // 1 <= w < 4, filter 1 pixel at a time
+        mul             v6.8h,   v2.8h,   v0.8h
+        mul             v7.8h,   v4.8h,   v0.8h
+        addv            h6,      v6.8h
+        addv            h7,      v7.8h
+        dup             v16.4h,  v2.h[3]
+        ins             v16.h[1], v4.h[3]
+        ins             v6.h[1], v7.h[0]
+        shl             v16.4h,  v16.4h,  #7
+        sub             v16.4h,  v16.4h,  v30.4h
+        sqadd           v6.4h,   v6.4h,   v16.4h
+        sshr            v6.4h,   v6.4h,   #3
+        add             v6.4h,   v6.4h,   v31.4h
+        st1             {v6.h}[0], [x0],  #2
+        st1             {v6.h}[1], [x12], #2
+        subs            w5,  w5,  #1
+        ext             v2.16b,  v2.16b,  v3.16b,  #2
+        ext             v4.16b,  v4.16b,  v5.16b,  #2
+        b.gt            888b
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x10
+        add             x12, x12, x10
+        add             x2,  x2,  x3
+        add             x13, x13, x3
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem filter
+endfunc
+
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                      const int16_t *mid, int w, int h,
+//                                      const int16_t fv[7], enum LrEdgeFlags edges,
+//                                      ptrdiff_t mid_stride);
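+//
+// A rough C sketch of the vertical pass below; the middle tap is applied
+// as fv[3] + 128 (via v1), i.e. an extra 128 * mid[3] is folded into the
+// sum:
+//   sum = 0;
+//   for (i = 0; i < 7; i++) sum += fv[i] * mid[i];
+//   sum += 128 * mid[3];
+//   dst = clip_u8((sum + (1 << 10)) >> 11);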
+function wiener_filter_v_8bpc_neon, export=1
+        mov             w8,  w4
+        ld1             {v0.8h},  [x5]
+        movi            v1.8h, #128
+        add             v1.8h,  v1.8h,  v0.8h
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             w11, w4
+        tst             w6,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        sub             x2,  x2,  x7,  lsl #1
+        add             w11, w11, #2
+0:
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            1f
+        add             w11, w11, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into v16-v19 and pad properly.
+        tst             w6,  #4 // LR_HAVE_TOP
+        ld1             {v16.8h}, [x2], x7
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v18.8h}, [x2], x7
+        mov             v17.16b, v16.16b
+        ld1             {v19.8h}, [x2], x7
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v17.16b, v16.16b
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v16.16b
+
+3:
+        cmp             w4,  #4
+        b.lt            5f
+        // Start filtering normally; fill in v20-v22 with unique rows.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        ld1             {v22.8h}, [x2], x7
+
+4:
+.macro filter compare
+        subs            w4,  w4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        smull           v2.4s,  v16.4h,  v0.h[0]
+        smlal           v2.4s,  v17.4h,  v0.h[1]
+        smlal           v2.4s,  v18.4h,  v0.h[2]
+        smlal           v2.4s,  v19.4h,  v1.h[3]
+        smlal           v2.4s,  v20.4h,  v0.h[4]
+        smlal           v2.4s,  v21.4h,  v0.h[5]
+        smlal           v2.4s,  v22.4h,  v0.h[6]
+        smull2          v3.4s,  v16.8h,  v0.h[0]
+        smlal2          v3.4s,  v17.8h,  v0.h[1]
+        smlal2          v3.4s,  v18.8h,  v0.h[2]
+        smlal2          v3.4s,  v19.8h,  v1.h[3]
+        smlal2          v3.4s,  v20.8h,  v0.h[4]
+        smlal2          v3.4s,  v21.8h,  v0.h[5]
+        smlal2          v3.4s,  v22.8h,  v0.h[6]
+        sqrshrun        v2.4h,  v2.4s,   #11
+        sqrshrun2       v2.8h,  v3.4s,   #11
+        sqxtun          v2.8b,  v2.8h
+        st1             {v2.8b}, [x0], x1
+.if \compare
+        cmp             w4,  #4
+.else
+        b.le            9f
+.endif
+        mov             v16.16b,  v17.16b
+        mov             v17.16b,  v18.16b
+        mov             v18.16b,  v19.16b
+        mov             v19.16b,  v20.16b
+        mov             v20.16b,  v21.16b
+        mov             v21.16b,  v22.16b
+.endm
+        filter          1
+        b.lt            7f
+        ld1             {v22.8h}, [x2], x7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of v20-v21 are filled yet.
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            6f
+        // LR_HAVE_BOTTOM
+        cmp             w4,  #2
+        // We load at least 2 rows in all cases.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        b.gt            53f // 3 rows in total
+        b.eq            52f // 2 rows in total
+51:     // 1 row in total, v19 already loaded, load edge into v20-v22.
+        mov             v22.16b,  v21.16b
+        b               8f
+52:     // 2 rows in total, v19 already loaded, load v20 with content data
+        // and 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        mov             v23.16b,  v22.16b
+        b               8f
+53:
+        // 3 rows in total, v19 already loaded, load v20 and v21 with content
+        // and 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        ld1             {v23.8h}, [x2], x7
+        mov             v24.16b,  v23.16b
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             w4,  #2
+        b.gt            63f // 3 rows in total
+        b.eq            62f // 2 rows in total
+61:     // 1 row in total, v19 already loaded, pad that into v20-v22.
+        mov             v20.16b,  v19.16b
+        mov             v21.16b,  v19.16b
+        mov             v22.16b,  v19.16b
+        b               8f
+62:     // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
+        ld1             {v20.8h}, [x2], x7
+        mov             v21.16b,  v20.16b
+        mov             v22.16b,  v20.16b
+        mov             v23.16b,  v20.16b
+        b               8f
+63:
+        // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        mov             v22.16b,  v21.16b
+        mov             v23.16b,  v21.16b
+        mov             v24.16b,  v21.16b
+        b               8f
+
+7:
+        // All registers up to v21 are filled already, 3 valid rows left.
+        // < 4 valid rows left; fill in padding and filter the last
+        // few rows.
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        ld1             {v23.8h}, [x2], x7
+        mov             v24.16b,  v23.16b
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        mov             v22.16b,  v21.16b
+        mov             v23.16b,  v21.16b
+        mov             v24.16b,  v21.16b
+
+8:      // At this point, all registers up to v22-v24 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        mov             v22.16b,  v23.16b
+        mov             v23.16b,  v24.16b
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            w3,  w3,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        msub            x0,  x1,  x8,  x0
+        msub            x2,  x7,  x11, x2
+        add             x0,  x0,  #8
+        add             x2,  x2,  #16
+        mov             w4,  w8
+        b               1b
+
+0:
+        ret
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                  const pixel *src, int w, int h);
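+//
+// Plain copy of a narrow (w < 8) strip from packed rows to strided rows;
+// roughly:
+//   for (y = 0; y < h; y++)
+//       memcpy(&dst[y * stride], &src[y * w], w);
+// dispatched on w via the jump table at the end of the function.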
+function copy_narrow_8bpc_neon, export=1
+        adr             x5,  L(copy_narrow_tbl)
+        ldrh            w6,  [x5, w3, uxtw #1]
+        sub             x5,  x5,  w6, uxth
+        br              x5
+10:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+18:
+        subs            w4,  w4,  #8
+        b.lt            110f
+        ld1             {v0.8b}, [x2], #8
+        st1             {v0.b}[0], [x0], x1
+        st1             {v0.b}[1], [x7], x1
+        st1             {v0.b}[2], [x0], x1
+        st1             {v0.b}[3], [x7], x1
+        st1             {v0.b}[4], [x0], x1
+        st1             {v0.b}[5], [x7], x1
+        st1             {v0.b}[6], [x0], x1
+        st1             {v0.b}[7], [x7], x1
+        b.le            0f
+        b               18b
+110:
+        add             w4,  w4,  #8
+        asr             x1,  x1,  #1
+11:
+        subs            w4,  w4,  #1
+        ld1             {v0.b}[0], [x2], #1
+        st1             {v0.b}[0], [x0], x1
+        b.gt            11b
+0:
+        ret
+
+20:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+24:
+        subs            w4,  w4,  #4
+        b.lt            210f
+        ld1             {v0.4h}, [x2], #8
+        st1             {v0.h}[0], [x0], x1
+        st1             {v0.h}[1], [x7], x1
+        st1             {v0.h}[2], [x0], x1
+        st1             {v0.h}[3], [x7], x1
+        b.le            0f
+        b               24b
+210:
+        add             w4,  w4,  #4
+        asr             x1,  x1,  #1
+22:
+        subs            w4,  w4,  #1
+        ld1             {v0.h}[0], [x2], #2
+        st1             {v0.h}[0], [x0], x1
+        b.gt            22b
+0:
+        ret
+
+30:
+        ldrh            w5,  [x2]
+        ldrb            w6,  [x2, #2]
+        add             x2,  x2,  #3
+        subs            w4,  w4,  #1
+        strh            w5,  [x0]
+        strb            w6,  [x0, #2]
+        add             x0,  x0,  x1
+        b.gt            30b
+        ret
+
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+42:
+        subs            w4,  w4,  #2
+        b.lt            41f
+        ld1             {v0.2s}, [x2], #8
+        st1             {v0.s}[0], [x0], x1
+        st1             {v0.s}[1], [x7], x1
+        b.le            0f
+        b               42b
+41:
+        ld1             {v0.s}[0], [x2]
+        st1             {v0.s}[0], [x0]
+0:
+        ret
+
+50:
+        ldr             w5,  [x2]
+        ldrb            w6,  [x2, #4]
+        add             x2,  x2,  #5
+        subs            w4,  w4,  #1
+        str             w5,  [x0]
+        strb            w6,  [x0, #4]
+        add             x0,  x0,  x1
+        b.gt            50b
+        ret
+
+60:
+        ldr             w5,  [x2]
+        ldrh            w6,  [x2, #4]
+        add             x2,  x2,  #6
+        subs            w4,  w4,  #1
+        str             w5,  [x0]
+        strh            w6,  [x0, #4]
+        add             x0,  x0,  x1
+        b.gt            60b
+        ret
+
+70:
+        ldr             w5,  [x2]
+        ldrh            w6,  [x2, #4]
+        ldrb            w7,  [x2, #6]
+        add             x2,  x2,  #7
+        subs            w4,  w4,  #1
+        str             w5,  [x0]
+        strh            w6,  [x0, #4]
+        strb            w7,  [x0, #6]
+        add             x0,  x0,  x1
+        b.gt            70b
+        ret
+
+L(copy_narrow_tbl):
+        .hword 0
+        .hword L(copy_narrow_tbl) - 10b
+        .hword L(copy_narrow_tbl) - 20b
+        .hword L(copy_narrow_tbl) - 30b
+        .hword L(copy_narrow_tbl) - 40b
+        .hword L(copy_narrow_tbl) - 50b
+        .hword L(copy_narrow_tbl) - 60b
+        .hword L(copy_narrow_tbl) - 70b
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                 const pixel (*left)[4],
+//                                 const pixel *src, const ptrdiff_t stride,
+//                                 const int w, const int h,
+//                                 const enum LrEdgeFlags edges);
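+//
+// Produces, for each position, 3-tap horizontal box sums of the source
+// and of its square (rough sketch, ignoring edge handling; two rows are
+// processed per iteration):
+//   sum[x]   = src[x - 1] + src[x] + src[x + 1];
+//   sumsq[x] = src[x - 1] * src[x - 1] + src[x] * src[x]
+//            + src[x + 1] * src[x + 1];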
+function sgr_box3_h_8bpc_neon, export=1
+        add             w5,  w5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq
+        add             x11, x1,  #(2*SUM_STRIDE)   // sum
+        add             x12, x3,  x4                // src
+        lsl             x4,  x4,  #1
+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            0f
+        // !LR_HAVE_RIGHT
+        add             w13, w5,  #3
+        bic             w13, w13, #3
+        b               1f
+0:
+        add             w13, w5,  #7
+        bic             w13, w13, #7
+1:
+        sub             x9,  x9,  w13, uxtw #1
+
+        // Store the width for the vertical loop
+        mov             w8,  w5
+
+        // Subtract the number of pixels read from the input from the stride
+        add             w13, w5,  #14
+        bic             w13, w13, #7
+        sub             x4,  x4,  w13, uxtw
+
+        // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x2,  0f
+        // left == NULL
+        sub             x3,  x3,  #2
+        sub             x12, x12, #2
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             x4,  x4,  #2
+
+1:      // Loop vertically
+        ld1             {v0.16b},  [x3],  #16
+        ld1             {v4.16b},  [x12], #16
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x2,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v1.s}[3],  [x2], #4
+        // Move x3/x12 back to account for the last 2 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             x3,  x3,  #2
+        sub             x12, x12, #2
+        ld1             {v5.s}[3],  [x2], #4
+        ext             v0.16b, v1.16b, v0.16b, #14
+        ext             v4.16b, v5.16b, v4.16b, #14
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+        // and shift v0 to have 2x the first byte at the front.
+        dup             v1.16b, v0.b[0]
+        dup             v5.16b, v4.b[0]
+        // Move x3 back to account for the last 2 bytes we loaded before,
+        // which we shifted out.
+        sub             x3,  x3,  #2
+        sub             x12, x12, #2
+        ext             v0.16b, v1.16b, v0.16b, #14
+        ext             v4.16b, v5.16b, v4.16b, #14
+
+2:
+        umull           v1.8h,   v0.8b,   v0.8b
+        umull2          v2.8h,   v0.16b,  v0.16b
+        umull           v5.8h,   v4.8b,   v4.8b
+        umull2          v6.8h,   v4.16b,  v4.16b
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             w13, w5, #(2 + 16 - 2 + 1)
+        ldr             b30, [x3,  w13, sxtw]
+        ldr             b31, [x12, w13, sxtw]
+        // Fill v30/v31 with the right padding pixel
+        dup             v30.8b,  v30.b[0]
+        dup             v31.8b,  v31.b[0]
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             w5,  #10
+        b.ge            4f   // If w >= 10, all used input pixels are valid
+        cmp             w5,  #6
+        b.ge            5f   // If w >= 6, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro uaddl_nh         dst1, dst2, src1, src2, w
+        uaddl           \dst1,  \src1\().4h,  \src2\().4h
+.if \w > 4
+        uaddl2          \dst2,  \src1\().8h,  \src2\().8h
+.endif
+.endm
+.macro uaddw_nh         dst1, dst2, src, w
+        uaddw           \dst1,  \dst1,  \src\().4h
+.if \w > 4
+        uaddw2          \dst2,  \dst2,  \src\().8h
+.endif
+.endm
+.macro add_nh           dst1, dst2, src1, src2, w
+        add             \dst1,  \dst1,  \src1
+.if \w > 4
+        add             \dst2,  \dst2,  \src2
+.endif
+.endm
+
+.macro add3 w
+        ext             v16.16b, v0.16b,  v0.16b, #1
+        ext             v17.16b, v0.16b,  v0.16b, #2
+        ext             v18.16b, v4.16b,  v4.16b, #1
+        ext             v19.16b, v4.16b,  v4.16b, #2
+        uaddl           v3.8h,   v0.8b,   v16.8b
+        uaddw           v3.8h,   v3.8h,   v17.8b
+        uaddl           v7.8h,   v4.8b,   v18.8b
+        uaddw           v7.8h,   v7.8h,   v19.8b
+
+        ext             v20.16b, v1.16b,  v2.16b, #2
+        ext             v21.16b, v1.16b,  v2.16b, #4
+        ext             v22.16b, v5.16b,  v6.16b, #2
+        ext             v23.16b, v5.16b,  v6.16b, #4
+
+        uaddl_nh        v26.4s,  v27.4s,  v1,   v20,  \w
+        uaddw_nh        v26.4s,  v27.4s,  v21,        \w
+
+        uaddl_nh        v28.4s,  v29.4s,  v5,   v22,  \w
+        uaddw_nh        v28.4s,  v29.4s,  v23,        \w
+.endm
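+        // Roughly, in C terms (an illustrative model only, not part of the
+        // build; s[] stands for the padded source row held in v0, and the
+        // same is done for the second row in v4):
+        //
+        //   for (int i = 0; i < n; i++) { // n = 8 or 4 outputs
+        //       sum[i]   = s[i] + s[i + 1] + s[i + 2];
+        //       sumsq[i] = s[i] * s[i] + s[i + 1] * s[i + 1] +
+        //                  s[i + 2] * s[i + 2];
+        //   }
+        //
+        // The squares already live in v1/v2 (v5/v6 for the second row), so
+        // the 32-bit sumsq is assembled from those with ext + widening adds.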
+        add3            8
+        st1             {v3.8h},         [x1],  #16
+        st1             {v7.8h},         [x11], #16
+        st1             {v26.4s,v27.4s}, [x0],  #32
+        st1             {v28.4s,v29.4s}, [x10], #32
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        ld1             {v3.8b},  [x3],  #8
+        ld1             {v7.8b},  [x12], #8
+        mov             v1.16b,  v2.16b
+        mov             v5.16b,  v6.16b
+        ext             v0.16b,  v0.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v7.16b, #8
+        umull           v2.8h,   v3.8b,   v3.8b
+        umull           v6.8h,   v7.8b,   v7.8b
+
+        b.ne            4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 6 <= w < 10
+        add3            4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+
+        subs            w5,  w5,  #4 // 2 <= w < 6
+        ext             v0.16b,  v0.16b,  v0.16b, #4
+        ext             v4.16b,  v4.16b,  v4.16b, #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // 2 <= w < 6, 2-5 pixels valid in v0/v4
+        sub             w13,  w5,  #2
+        // w13 = (pixels valid - 2)
+        adr             x14, L(box3_variable_shift_tbl)
+        ldrh            w13, [x14, w13, uxtw #1]
+        sub             x13, x14, w13, uxth
+        br              x13
+        // Shift v0/v4 right, shifting out invalid pixels,
+        // then shift them left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #2
+        ext             v4.16b,  v4.16b,  v4.16b,  #2
+        ext             v0.16b,  v0.16b,  v30.16b, #14
+        ext             v4.16b,  v4.16b,  v31.16b, #14
+        b               88f
+33:     // 3 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #3
+        ext             v4.16b,  v4.16b,  v4.16b,  #3
+        ext             v0.16b,  v0.16b,  v30.16b, #13
+        ext             v4.16b,  v4.16b,  v31.16b, #13
+        b               88f
+44:     // 4 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #4
+        ext             v4.16b,  v4.16b,  v4.16b,  #4
+        ext             v0.16b,  v0.16b,  v30.16b, #12
+        ext             v4.16b,  v4.16b,  v31.16b, #12
+        b               88f
+55:     // 5 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #5
+        ext             v4.16b,  v4.16b,  v4.16b,  #5
+        ext             v0.16b,  v0.16b,  v30.16b, #11
+        ext             v4.16b,  v4.16b,  v31.16b, #11
+        b               88f
+
+L(box3_variable_shift_tbl):
+        .hword L(box3_variable_shift_tbl) - 22b
+        .hword L(box3_variable_shift_tbl) - 33b
+        .hword L(box3_variable_shift_tbl) - 44b
+        .hword L(box3_variable_shift_tbl) - 55b
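+        // The table above implements a computed branch: entry k holds the
+        // distance from the table base back to the handler for k + 2 valid
+        // pixels, which the ldrh/sub/br sequence above turns into a jump to
+        // label 22/33/44/55. In rough C terms (illustration only):
+        //
+        //   switch (pixels_valid) {
+        //   case 2: /* label 22 */ break;
+        //   case 3: /* label 33 */ break;
+        //   case 4: /* label 44 */ break;
+        //   case 5: /* label 55 */ break;
+        //   }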
+
+88:
+        umull           v1.8h,   v0.8b,   v0.8b
+        umull2          v2.8h,   v0.16b,  v0.16b
+        umull           v5.8h,   v4.8b,   v4.8b
+        umull2          v6.8h,   v4.16b,  v4.16b
+
+        add3            4
+        subs            w5,  w5,  #4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+        b.le            9f
+        ext             v0.16b,  v0.16b,  v0.16b, #4
+        ext             v4.16b,  v4.16b,  v4.16b, #4
+        ext             v1.16b,  v1.16b,  v2.16b, #8
+        ext             v5.16b,  v5.16b,  v6.16b, #8
+        // Only one more pixel is needed, but do a normal 4-pixel
+        // addition anyway
+        add3            4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x9, lsl #1
+        add             x10, x10, x9, lsl #1
+        add             x1,  x1,  x9
+        add             x11, x11, x9
+        add             x3,  x3,  x4
+        add             x12, x12, x4
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                 const pixel (*left)[4],
+//                                 const pixel *src, const ptrdiff_t stride,
+//                                 const int w, const int h,
+//                                 const enum LrEdgeFlags edges);
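+//
+// As with box3 above, a rough C model of one horizontal step (an
+// illustration only, not part of the build; s[] is the padded source row):
+//
+//   for (int i = 0; i < n; i++) { // n = 8 or 4 outputs
+//       sum[i]   = s[i] + s[i + 1] + s[i + 2] + s[i + 3] + s[i + 4];
+//       sumsq[i] = s[i] * s[i] + s[i + 1] * s[i + 1] + s[i + 2] * s[i + 2]
+//                + s[i + 3] * s[i + 3] + s[i + 4] * s[i + 4];
+//   }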
+function sgr_box5_h_8bpc_neon, export=1
+        add             w5,  w5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq
+        add             x11, x1,  #(2*SUM_STRIDE)   // sum
+        add             x12, x3,  x4                // src
+        lsl             x4,  x4,  #1
+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8; without it, align to 4.
+        // Also subtract the number of pixels read per row from the input stride.
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            0f
+        // !LR_HAVE_RIGHT
+        add             w13, w5,  #3
+        bic             w13, w13, #3
+        add             w14, w5,  #13
+        b               1f
+0:
+        add             w13, w5,  #7
+        bic             w13, w13, #7
+        add             w14, w5,  #15
+1:
+        sub             x9,  x9,  w13, uxtw #1
+        bic             w14, w14, #7
+        sub             x4,  x4,  w14, uxtw
+
+        // Store the width for the vertical loop
+        mov             w8,  w5
+
+        // Set up the src pointers to include the left edge, for the case of
+        // LR_HAVE_LEFT with left == NULL
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x2,  0f
+        // left == NULL
+        sub             x3,  x3,  #3
+        sub             x12, x12, #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but adjust the stride as if we had done that.
+        add             x4,  x4,  #3
+
+1:      // Loop vertically
+        ld1             {v0.16b},  [x3],  #16
+        ld1             {v4.16b},  [x12], #16
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x2,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v1.s}[3],  [x2], #4
+        // Move x3/x12 back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             x3,  x3,  #3
+        sub             x12, x12, #3
+        ld1             {v5.s}[3],  [x2], #4
+        ext             v0.16b, v1.16b, v0.16b, #13
+        ext             v4.16b, v5.16b, v4.16b, #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+        // and shift v0 to have 3x the first byte at the front.
+        dup             v1.16b, v0.b[0]
+        dup             v5.16b, v4.b[0]
+        // Move x3/x12 back to account for the last 3 bytes we loaded before,
+        // which we shifted out.
+        sub             x3,  x3,  #3
+        sub             x12, x12, #3
+        ext             v0.16b, v1.16b, v0.16b, #13
+        ext             v4.16b, v5.16b, v4.16b, #13
+
+2:
+        umull           v1.8h,   v0.8b,   v0.8b
+        umull2          v2.8h,   v0.16b,  v0.16b
+        umull           v5.8h,   v4.8b,   v4.8b
+        umull2          v6.8h,   v4.16b,  v4.16b
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load the byte to pad with
+        // here, while it's still easy to find.
+        sub             w13, w5, #(2 + 16 - 3 + 1)
+        ldr             b30, [x3,  w13, sxtw]
+        ldr             b31, [x12, w13, sxtw]
+        // Fill v30/v31 with the right padding pixel
+        dup             v30.8b,  v30.b[0]
+        dup             v31.8b,  v31.b[0]
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge, we need to quit early here.
+        cmp             w5,  #11
+        b.ge            4f   // If w >= 11, all used input pixels are valid
+        cmp             w5,  #7
+        b.ge            5f   // If w >= 7, we can produce 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add5 w
+        ext             v16.16b, v0.16b,  v0.16b, #1
+        ext             v17.16b, v0.16b,  v0.16b, #2
+        ext             v18.16b, v0.16b,  v0.16b, #3
+        ext             v19.16b, v0.16b,  v0.16b, #4
+        ext             v20.16b, v4.16b,  v4.16b, #1
+        ext             v21.16b, v4.16b,  v4.16b, #2
+        ext             v22.16b, v4.16b,  v4.16b, #3
+        ext             v23.16b, v4.16b,  v4.16b, #4
+        uaddl           v3.8h,   v0.8b,   v16.8b
+        uaddl           v24.8h,  v17.8b,  v18.8b
+        uaddl           v7.8h,   v4.8b,   v20.8b
+        uaddw           v3.8h,   v3.8h,   v19.8b
+        uaddl           v25.8h,  v21.8b,  v22.8b
+        uaddw           v7.8h,   v7.8h,   v23.8b
+        add             v3.8h,   v3.8h,   v24.8h
+        add             v7.8h,   v7.8h,   v25.8h
+
+        ext             v16.16b, v1.16b,  v2.16b, #2
+        ext             v17.16b, v1.16b,  v2.16b, #4
+        ext             v18.16b, v1.16b,  v2.16b, #6
+        ext             v19.16b, v1.16b,  v2.16b, #8
+        ext             v20.16b, v5.16b,  v6.16b, #2
+        ext             v21.16b, v5.16b,  v6.16b, #4
+        ext             v22.16b, v5.16b,  v6.16b, #6
+        ext             v23.16b, v5.16b,  v6.16b, #8
+
+        uaddl_nh        v26.4s,  v27.4s,  v1,   v16,  \w
+        uaddl_nh        v16.4s,  v17.4s,  v17,  v18,  \w
+        uaddl_nh        v28.4s,  v29.4s,  v5,   v20,  \w
+        uaddw_nh        v26.4s,  v27.4s,  v19,        \w
+        uaddl_nh        v20.4s,  v21.4s,  v21,  v22,  \w
+        uaddw_nh        v28.4s,  v29.4s,  v23,        \w
+        add_nh          v26.4s,  v27.4s,  v16.4s, v17.4s, \w
+        add_nh          v28.4s,  v29.4s,  v20.4s, v21.4s, \w
+.endm
+        add5            8
+        st1             {v3.8h},         [x1],  #16
+        st1             {v7.8h},         [x11], #16
+        st1             {v26.4s,v27.4s}, [x0],  #32
+        st1             {v28.4s,v29.4s}, [x10], #32
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        ld1             {v3.8b},  [x3],  #8
+        ld1             {v7.8b},  [x12], #8
+        mov             v1.16b,  v2.16b
+        mov             v5.16b,  v6.16b
+        ext             v0.16b,  v0.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v7.16b, #8
+        umull           v2.8h,   v3.8b,   v3.8b
+        umull           v6.8h,   v7.8b,   v7.8b
+        b.ne            4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 7 <= w < 11
+        add5            4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+
+        subs            w5,  w5,  #4 // 3 <= w < 7
+        ext             v0.16b,  v0.16b,  v0.16b, #4
+        ext             v4.16b,  v4.16b,  v4.16b, #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // w < 7, w+1 pixels valid in v0/v4
+        sub             w13,  w5,  #1
+        // w13 = pixels valid - 2
+        adr             x14, L(box5_variable_shift_tbl)
+        ldrh            w13, [x14, w13, uxtw #1]
+        sub             x13, x14, w13, uxth
+        br              x13
+        // Shift v0/v4 right, shifting out invalid pixels,
+        // then shift them left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #2
+        ext             v4.16b,  v4.16b,  v4.16b,  #2
+        ext             v0.16b,  v0.16b,  v30.16b, #14
+        ext             v4.16b,  v4.16b,  v31.16b, #14
+        b               88f
+33:     // 3 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #3
+        ext             v4.16b,  v4.16b,  v4.16b,  #3
+        ext             v0.16b,  v0.16b,  v30.16b, #13
+        ext             v4.16b,  v4.16b,  v31.16b, #13
+        b               88f
+44:     // 4 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #4
+        ext             v4.16b,  v4.16b,  v4.16b,  #4
+        ext             v0.16b,  v0.16b,  v30.16b, #12
+        ext             v4.16b,  v4.16b,  v31.16b, #12
+        b               88f
+55:     // 5 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #5
+        ext             v4.16b,  v4.16b,  v4.16b,  #5
+        ext             v0.16b,  v0.16b,  v30.16b, #11
+        ext             v4.16b,  v4.16b,  v31.16b, #11
+        b               88f
+66:     // 6 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #6
+        ext             v4.16b,  v4.16b,  v4.16b,  #6
+        ext             v0.16b,  v0.16b,  v30.16b, #10
+        ext             v4.16b,  v4.16b,  v31.16b, #10
+        b               88f
+77:     // 7 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #7
+        ext             v4.16b,  v4.16b,  v4.16b,  #7
+        ext             v0.16b,  v0.16b,  v30.16b, #9
+        ext             v4.16b,  v4.16b,  v31.16b, #9
+        b               88f
+
+L(box5_variable_shift_tbl):
+        .hword L(box5_variable_shift_tbl) - 22b
+        .hword L(box5_variable_shift_tbl) - 33b
+        .hword L(box5_variable_shift_tbl) - 44b
+        .hword L(box5_variable_shift_tbl) - 55b
+        .hword L(box5_variable_shift_tbl) - 66b
+        .hword L(box5_variable_shift_tbl) - 77b
+
+88:
+        umull           v1.8h,   v0.8b,   v0.8b
+        umull2          v2.8h,   v0.16b,  v0.16b
+        umull           v5.8h,   v4.8b,   v4.8b
+        umull2          v6.8h,   v4.16b,  v4.16b
+
+        add5            4
+        subs            w5,  w5,  #4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+        b.le            9f
+        ext             v0.16b,  v0.16b,  v0.16b, #4
+        ext             v1.16b,  v1.16b,  v2.16b, #8
+        ext             v4.16b,  v4.16b,  v4.16b, #4
+        ext             v5.16b,  v5.16b,  v6.16b, #8
+        add5            4
+        st1             {v3.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v26.4s}, [x0],  #16
+        st1             {v28.4s}, [x10], #16
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x9, lsl #1
+        add             x10, x10, x9, lsl #1
+        add             x1,  x1,  x9
+        add             x11, x11, x9
+        add             x3,  x3,  x4
+        add             x12, x12, x4
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem add5
+endfunc
+
+sgr_funcs 8
diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S
new file mode 100644 (file)
index 0000000..95f24fc
--- /dev/null
@@ -0,0 +1,1239 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                       const pixel *src, ptrdiff_t stride,
+//                                       const int16_t fh[7], const intptr_t w,
+//                                       int h, enum LrEdgeFlags edges,
+//                                       const int bitdepth_max);
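+//
+// A rough scalar model of the arithmetic implemented below (an illustration
+// only, not part of the build; s[] is the edge-padded source row, and
+// round_bits_h = 25 - clz(bitdepth_max), i.e. 3 for 10 bpc and 5 for 12 bpc):
+//
+//   int32_t sum = (s[x] << 7) + (1 << (bitdepth + 6));
+//   for (int k = 0; k < 7; k++)
+//       sum += s[x - 3 + k] * fh[k];
+//   sum = (sum + (1 << (round_bits_h - 1))) >> round_bits_h; // srshl
+//   dst[x] = iclip(sum, 0, 0x7fff) - 8192;                   // sqxtun + umin + sub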
+function wiener_filter_h_16bpc_neon, export=1
+        ldr             w8,  [sp]      // bitdepth_max
+        ld1             {v0.8h},  [x4]
+        clz             w8,  w8
+        movi            v30.4s,  #1
+        sub             w9,  w8,  #38  // -(bitdepth + 6)
+        sub             w8,  w8,  #25  // -round_bits_h
+        neg             w9,  w9        // bitdepth + 6
+        dup             v1.4s,   w9
+        dup             v29.4s,  w8    // -round_bits_h
+        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
+        ushl            v30.4s,  v30.4s,  v1.4s // 1 << (bitdepth + 6)
+        mov             w8,  w5
+        // Calculate mid_stride
+        add             w10, w5,  #7
+        bic             w10, w10, #7
+        lsl             w10, w10, #1
+
+        // Clear the last unused element of v0, to allow filtering a single
+        // pixel with one plain mul+addv.
+        ins             v0.h[7], wzr
+
+        // Set up pointers for reading/writing alternate rows
+        add             x12, x0,  x10
+        lsl             w10, w10, #1
+        add             x13, x2,  x3
+        lsl             x3,  x3,  #1
+
+        // Subtract the width from mid_stride
+        sub             x10, x10, w5, uxtw #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+        cmp             w5,  #8
+        add             w11, w5,  #13
+        bic             w11, w11, #7
+        b.ge            1f
+        mov             w11, #16
+1:
+        sub             x3,  x3,  w11, uxtw #1
+
+        // Set up the src pointers to include the left edge, for the case of
+        // LR_HAVE_LEFT with left == NULL
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x1,  0f
+        // left == NULL
+        sub             x2,  x2,  #6
+        sub             x13, x13, #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but adjust the stride as if we had done that.
+        add             x3,  x3,  #6
+
+1:      // Loop vertically
+        ld1             {v2.8h, v3.8h},  [x2],  #32
+        ld1             {v4.8h, v5.8h},  [x13], #32
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x1,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v1.d}[1],  [x1], #8
+        // Move x2/x13 back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             x2,  x2,  #6
+        sub             x13, x13, #6
+        ld1             {v6.d}[1],  [x1], #8
+        ext             v3.16b,  v2.16b,  v3.16b,  #10
+        ext             v2.16b,  v1.16b,  v2.16b,  #10
+        ext             v5.16b,  v4.16b,  v5.16b,  #10
+        ext             v4.16b,  v6.16b,  v4.16b,  #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v1 with the leftmost pixel
+        // and shift v2/v3 to have 3x the first pixel at the front.
+        dup             v1.8h,   v2.h[0]
+        dup             v6.8h,   v4.h[0]
+        // Move x2/x13 back to account for the last 3 pixels we loaded before,
+        // which we shifted out.
+        sub             x2,  x2,  #6
+        sub             x13, x13, #6
+        ext             v3.16b,  v2.16b,  v3.16b,  #10
+        ext             v2.16b,  v1.16b,  v2.16b,  #10
+        ext             v5.16b,  v4.16b,  v5.16b,  #10
+        ext             v4.16b,  v6.16b,  v4.16b,  #10
+
+2:
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load the pixel to pad with
+        // here, while it's still easy to find.
+        sub             w9,  w5,  #14
+        ldr             h27, [x2,  w9, sxtw #1]
+        ldr             h28, [x13, w9, sxtw #1]
+        // Fill v27/v28 with the right padding pixel
+        dup             v27.8h,  v27.h[0]
+        dup             v28.8h,  v28.h[0]
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge, we need to quit early here.
+        cmp             w5,  #11
+        b.ge            4f   // If w >= 11, all used input pixels are valid
+        cmp             w5,  #7
+        b.ge            5f   // If w >= 7, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro ushll_sz d0, d1, src, shift, wd
+        ushll           \d0\().4s,  \src\().4h,  \shift
+.ifc \wd, .8h
+        ushll2          \d1\().4s,  \src\().8h,  \shift
+.endif
+.endm
+.macro add_sz d0, d1, s0, s1, c, wd
+        add             \d0\().4s,  \s0\().4s,   \c\().4s
+.ifc \wd, .8h
+        add             \d1\().4s,  \s1\().4s,   \c\().4s
+.endif
+.endm
+.macro srshl_sz d0, d1, s0, s1, c, wd
+        srshl           \d0\().4s,  \s0\().4s,   \c\().4s
+.ifc \wd, .8h
+        srshl           \d1\().4s,  \s1\().4s,   \c\().4s
+.endif
+.endm
+.macro sqxtun_sz dst, s0, s1, wd
+        sqxtun          \dst\().4h, \s0\().4s
+.ifc \wd, .8h
+        sqxtun2         \dst\().8h, \s1\().4s
+.endif
+.endm
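+        // The _sz macros above emit either one or two instructions, depending
+        // on whether the filter macro below operates on .8h (8 pixels, low
+        // and high halves) or .4h (4 pixels, low half only), so a single
+        // filter body serves both widths.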
+
+.macro filter wd
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, so the mul/mla sequences are kept
+        // tightly chained like this.
+        ext             v16.16b, v2.16b,  v3.16b, #2
+        ext             v17.16b, v2.16b,  v3.16b, #4
+        ext             v18.16b, v2.16b,  v3.16b, #6
+        ext             v19.16b, v2.16b,  v3.16b, #8
+        ext             v20.16b, v2.16b,  v3.16b, #10
+        ext             v21.16b, v2.16b,  v3.16b, #12
+        ushll_sz        v6,  v7,  v18, #7, \wd
+        smlal           v6.4s,   v2.4h,   v0.h[0]
+        smlal           v6.4s,   v16.4h,  v0.h[1]
+        smlal           v6.4s,   v17.4h,  v0.h[2]
+        smlal           v6.4s,   v18.4h,  v0.h[3]
+        smlal           v6.4s,   v19.4h,  v0.h[4]
+        smlal           v6.4s,   v20.4h,  v0.h[5]
+        smlal           v6.4s,   v21.4h,  v0.h[6]
+.ifc \wd, .8h
+        smlal2          v7.4s,   v2.8h,   v0.h[0]
+        smlal2          v7.4s,   v16.8h,  v0.h[1]
+        smlal2          v7.4s,   v17.8h,  v0.h[2]
+        smlal2          v7.4s,   v18.8h,  v0.h[3]
+        smlal2          v7.4s,   v19.8h,  v0.h[4]
+        smlal2          v7.4s,   v20.8h,  v0.h[5]
+        smlal2          v7.4s,   v21.8h,  v0.h[6]
+.endif
+        ext             v19.16b, v4.16b,  v5.16b, #2
+        ext             v20.16b, v4.16b,  v5.16b, #4
+        ext             v21.16b, v4.16b,  v5.16b, #6
+        ext             v22.16b, v4.16b,  v5.16b, #8
+        ext             v23.16b, v4.16b,  v5.16b, #10
+        ext             v24.16b, v4.16b,  v5.16b, #12
+        ushll_sz        v16, v17, v21, #7, \wd
+        smlal           v16.4s,  v4.4h,   v0.h[0]
+        smlal           v16.4s,  v19.4h,  v0.h[1]
+        smlal           v16.4s,  v20.4h,  v0.h[2]
+        smlal           v16.4s,  v21.4h,  v0.h[3]
+        smlal           v16.4s,  v22.4h,  v0.h[4]
+        smlal           v16.4s,  v23.4h,  v0.h[5]
+        smlal           v16.4s,  v24.4h,  v0.h[6]
+.ifc \wd, .8h
+        smlal2          v17.4s,  v4.8h,   v0.h[0]
+        smlal2          v17.4s,  v19.8h,  v0.h[1]
+        smlal2          v17.4s,  v20.8h,  v0.h[2]
+        smlal2          v17.4s,  v21.8h,  v0.h[3]
+        smlal2          v17.4s,  v22.8h,  v0.h[4]
+        smlal2          v17.4s,  v23.8h,  v0.h[5]
+        smlal2          v17.4s,  v24.8h,  v0.h[6]
+.endif
+        mvni            v24\wd,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+        add_sz          v6,  v7,  v6,  v7,  v30, \wd
+        add_sz          v16, v17, v16, v17, v30, \wd
+        srshl_sz        v6,  v7,  v6,  v7,  v29, \wd
+        srshl_sz        v16, v17, v16, v17, v29, \wd
+        sqxtun_sz       v6,  v6,  v7,  \wd
+        sqxtun_sz       v7,  v16, v17, \wd
+        umin            v6\wd,   v6\wd,   v24\wd
+        umin            v7\wd,   v7\wd,   v24\wd
+        sub             v6\wd,   v6\wd,   v31\wd
+        sub             v7\wd,   v7\wd,   v31\wd
+.endm
+        filter          .8h
+        st1             {v6.8h},  [x0],  #16
+        st1             {v7.8h},  [x12], #16
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        mov             v2.16b,  v3.16b
+        mov             v4.16b,  v5.16b
+        ld1             {v3.8h},  [x2],  #16
+        ld1             {v5.8h},  [x13], #16
+        b.ne            4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Filter 4 pixels, 7 <= w < 11
+        filter          .4h
+        st1             {v6.4h},  [x0],  #8
+        st1             {v7.4h},  [x12], #8
+
+        subs            w5,  w5,  #4 // 3 <= w < 7
+        ext             v2.16b,  v2.16b,  v3.16b, #8
+        ext             v3.16b,  v3.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v5.16b, #8
+        ext             v5.16b,  v5.16b,  v5.16b, #8
+
+6:      // Pad the right edge and filter the last few pixels.
+        // w < 7, w+3 pixels valid in v2-v3
+        cmp             w5,  #5
+        b.lt            7f
+        b.gt            8f
+        // w == 5, 8 pixels valid in v2, v3 invalid
+        mov             v3.16b,  v27.16b
+        mov             v5.16b,  v28.16b
+        b               88f
+
+7:      // 1 <= w < 5, 4-7 pixels valid in v2
+        sub             w9,  w5,  #1
+        // w9 = (pixels valid - 4)
+        adr             x11, L(variable_shift_tbl)
+        ldrh            w9,  [x11, w9, uxtw #1]
+        sub             x11, x11, w9, uxth
+        mov             v3.16b,  v27.16b
+        mov             v5.16b,  v28.16b
+        br              x11
+44:     // 4 pixels valid in v2/v4, fill the high half with padding.
+        ins             v2.d[1], v3.d[0]
+        ins             v4.d[1], v5.d[0]
+        b               88f
+        // Shift v2/v4 right, shifting out invalid pixels,
+        // then shift them left to the original offset, shifting in padding pixels.
+55:     // 5 pixels valid
+        ext             v2.16b,  v2.16b,  v2.16b,  #10
+        ext             v2.16b,  v2.16b,  v3.16b,  #6
+        ext             v4.16b,  v4.16b,  v4.16b,  #10
+        ext             v4.16b,  v4.16b,  v5.16b,  #6
+        b               88f
+66:     // 6 pixels valid, fill the upper 2 pixels with padding.
+        ins             v2.s[3], v3.s[0]
+        ins             v4.s[3], v5.s[0]
+        b               88f
+77:     // 7 pixels valid, fill the last pixel with padding.
+        ins             v2.h[7], v3.h[0]
+        ins             v4.h[7], v5.h[0]
+        b               88f
+
+L(variable_shift_tbl):
+        .hword L(variable_shift_tbl) - 44b
+        .hword L(variable_shift_tbl) - 55b
+        .hword L(variable_shift_tbl) - 66b
+        .hword L(variable_shift_tbl) - 77b
+
+8:      // w == 6 (the only w > 5 case left), 9 pixels valid in v2-v3,
+        // just 1 of them in v3
+        ins             v27.h[0],  v3.h[0]
+        ins             v28.h[0],  v5.h[0]
+        mov             v3.16b,  v27.16b
+        mov             v5.16b,  v28.16b
+
+88:
+        // w < 7, v2-v3 padded properly
+        cmp             w5,  #4
+        b.lt            888f
+
+        // w >= 4, filter 4 pixels
+        filter          .4h
+        st1             {v6.4h},  [x0],  #8
+        st1             {v7.4h},  [x12], #8
+        subs            w5,  w5,  #4 // 0 <= w < 4
+        ext             v2.16b,  v2.16b,  v3.16b, #8
+        ext             v4.16b,  v4.16b,  v5.16b, #8
+        b.eq            9f
+888:    // 1 <= w < 4, filter 1 pixel at a time
+        smull           v6.4s,   v2.4h,   v0.4h
+        smull2          v7.4s,   v2.8h,   v0.8h
+        smull           v16.4s,  v4.4h,   v0.4h
+        smull2          v17.4s,  v4.8h,   v0.8h
+        add             v6.4s,   v6.4s,   v7.4s
+        add             v16.4s,  v16.4s,  v17.4s
+        addv            s6,      v6.4s
+        addv            s7,      v16.4s
+        dup             v16.4h,  v2.h[3]
+        ins             v16.h[1], v4.h[3]
+        ins             v6.s[1], v7.s[0]
+        mvni            v24.4h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+        ushll           v16.4s,  v16.4h,  #7
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v16.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        sqxtun          v6.4h,   v6.4s
+        umin            v6.4h,   v6.4h,   v24.4h
+        sub             v6.4h,   v6.4h,   v31.4h
+        st1             {v6.h}[0], [x0],  #2
+        st1             {v6.h}[1], [x12], #2
+        subs            w5,  w5,  #1
+        ext             v2.16b,  v2.16b,  v3.16b,  #2
+        ext             v4.16b,  v4.16b,  v5.16b,  #2
+        b.gt            888b
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x10
+        add             x12, x12, x10
+        add             x2,  x2,  x3
+        add             x13, x13, x3
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem filter
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                       const int16_t *mid, int w, int h,
+//                                       const int16_t fv[7], enum LrEdgeFlags edges,
+//                                       ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+        ldr             w8,  [sp]       // bitdepth_max
+        ld1             {v0.8h},  [x5]
+        dup             v31.8h,  w8
+        clz             w8,  w8
+        movi            v1.8h,   #128
+        sub             w8,  w8,  #11   // round_bits_v
+        add             v1.8h,   v1.8h,   v0.8h
+        dup             v30.4s,  w8
+        mov             w8,  w4
+        neg             v30.4s,  v30.4s // -round_bits_v
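+        // At this point v0 = fv, v31 = bitdepth_max for the final clamp,
+        // v30 = -round_bits_v, and v1.h[3] = fv[3] + 128. The filter macro
+        // below uses v1.h[3] rather than v0.h[3] for the centre tap; the
+        // extra 128 (1 << 7) mirrors the (pixel << 7) centre term that the
+        // horizontal pass folded into its output.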
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             w11, w4
+        tst             w6,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        sub             x2,  x2,  x7,  lsl #1
+        add             w11, w11, #2
+0:
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            1f
+        add             w11, w11, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into v16-v19 and pad properly.
+        tst             w6,  #4 // LR_HAVE_TOP
+        ld1             {v16.8h}, [x2], x7
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v18.8h}, [x2], x7
+        mov             v17.16b, v16.16b
+        ld1             {v19.8h}, [x2], x7
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v17.16b, v16.16b
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v16.16b
+
+3:
+        cmp             w4,  #4
+        b.lt            5f
+        // Start filtering normally; fill in v20-v22 with unique rows.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        ld1             {v22.8h}, [x2], x7
+
+4:
+.macro filter compare
+        subs            w4,  w4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, so the mul/mla sequences are kept
+        // tightly chained like this.
+        smull           v2.4s,  v16.4h,  v0.h[0]
+        smlal           v2.4s,  v17.4h,  v0.h[1]
+        smlal           v2.4s,  v18.4h,  v0.h[2]
+        smlal           v2.4s,  v19.4h,  v1.h[3]
+        smlal           v2.4s,  v20.4h,  v0.h[4]
+        smlal           v2.4s,  v21.4h,  v0.h[5]
+        smlal           v2.4s,  v22.4h,  v0.h[6]
+        smull2          v3.4s,  v16.8h,  v0.h[0]
+        smlal2          v3.4s,  v17.8h,  v0.h[1]
+        smlal2          v3.4s,  v18.8h,  v0.h[2]
+        smlal2          v3.4s,  v19.8h,  v1.h[3]
+        smlal2          v3.4s,  v20.8h,  v0.h[4]
+        smlal2          v3.4s,  v21.8h,  v0.h[5]
+        smlal2          v3.4s,  v22.8h,  v0.h[6]
+        srshl           v2.4s,  v2.4s,   v30.4s // round_bits_v
+        srshl           v3.4s,  v3.4s,   v30.4s
+        sqxtun          v2.4h,  v2.4s
+        sqxtun2         v2.8h,  v3.4s
+        umin            v2.8h,  v2.8h,   v31.8h // bitdepth_max
+        st1             {v2.8h}, [x0], x1
+.if \compare
+        cmp             w4,  #4
+.else
+        b.le            9f
+.endif
+        mov             v16.16b,  v17.16b
+        mov             v17.16b,  v18.16b
+        mov             v18.16b,  v19.16b
+        mov             v19.16b,  v20.16b
+        mov             v20.16b,  v21.16b
+        mov             v21.16b,  v22.16b
+.endm
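+        // filter 1 is the main loop variant: it compares w4 against 4 so
+        // that the b.lt below can divert to the tail handling at 7f once
+        // fewer than 4 rows remain. filter 0 is the tail variant: it
+        // branches straight to 9f when all rows are done.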
+        filter          1
+        b.lt            7f
+        ld1             {v22.8h}, [x2], x7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of v20-v22 are filled yet.
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            6f
+        // LR_HAVE_BOTTOM
+        cmp             w4,  #2
+        // We load at least 2 rows in all cases.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        b.gt            53f // 3 rows in total
+        b.eq            52f // 2 rows in total
+51:     // 1 row in total, v19 already loaded, load edge into v20-v22.
+        mov             v22.16b,  v21.16b
+        b               8f
+52:     // 2 rows in total, v19 already loaded, load v20 with content data
+        // and 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        mov             v23.16b,  v22.16b
+        b               8f
+53:
+        // 3 rows in total, v19 already loaded, load v20 and v21 with content
+        // and 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        ld1             {v23.8h}, [x2], x7
+        mov             v24.16b,  v23.16b
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             w4,  #2
+        b.gt            63f // 3 rows in total
+        b.eq            62f // 2 rows in total
+61:     // 1 row in total, v19 already loaded, pad that into v20-v22.
+        mov             v20.16b,  v19.16b
+        mov             v21.16b,  v19.16b
+        mov             v22.16b,  v19.16b
+        b               8f
+62:     // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
+        ld1             {v20.8h}, [x2], x7
+        mov             v21.16b,  v20.16b
+        mov             v22.16b,  v20.16b
+        mov             v23.16b,  v20.16b
+        b               8f
+63:
+        // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
+        ld1             {v20.8h}, [x2], x7
+        ld1             {v21.8h}, [x2], x7
+        mov             v22.16b,  v21.16b
+        mov             v23.16b,  v21.16b
+        mov             v24.16b,  v21.16b
+        b               8f
+
+7:
+        // All registers up to v21 are filled already, 3 valid rows left;
+        // fill in padding and filter the last few rows.
+        tst             w6,  #8 // LR_HAVE_BOTTOM
+        b.eq            71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        ld1             {v22.8h}, [x2], x7
+        ld1             {v23.8h}, [x2], x7
+        mov             v24.16b,  v23.16b
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        mov             v22.16b,  v21.16b
+        mov             v23.16b,  v21.16b
+        mov             v24.16b,  v21.16b
+
+8:      // At this point, all registers up to one of v22-v24 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        mov             v22.16b,  v23.16b
+        mov             v23.16b,  v24.16b
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            w3,  w3,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        msub            x0,  x1,  x8,  x0
+        msub            x2,  x7,  x11, x2
+        add             x0,  x0,  #16
+        add             x2,  x2,  #16
+        mov             w4,  w8
+        b               1b
+
+0:
+        ret
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                   const pixel *src, int w, int h);
+function copy_narrow_16bpc_neon, export=1
+        adr             x5,  L(copy_narrow_tbl)
+        ldrh            w6,  [x5, w3, uxtw #1]
+        sub             x5,  x5,  w6, uxth
+        br              x5
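+        // A rough C model of the whole function (an illustration only, not
+        // part of the build): the source rows are tightly packed w pixels
+        // apart, while the destination uses the byte stride:
+        //
+        //   for (int y = 0; y < h; y++)
+        //       memcpy((char *)dst + y * stride, src + y * w,
+        //              w * sizeof(pixel));
+        //
+        // Each per-width entry point below just unrolls this with the widest
+        // loads/stores available, pairing up rows where possible.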
+10:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+18:
+        subs            w4,  w4,  #8
+        b.lt            110f
+        ld1             {v0.8h}, [x2], #16
+        st1             {v0.h}[0], [x0], x1
+        st1             {v0.h}[1], [x7], x1
+        st1             {v0.h}[2], [x0], x1
+        st1             {v0.h}[3], [x7], x1
+        st1             {v0.h}[4], [x0], x1
+        st1             {v0.h}[5], [x7], x1
+        st1             {v0.h}[6], [x0], x1
+        st1             {v0.h}[7], [x7], x1
+        b.le            0f
+        b               18b
+110:
+        add             w4,  w4,  #8
+        asr             x1,  x1,  #1
+11:
+        subs            w4,  w4,  #1
+        ld1             {v0.h}[0], [x2], #2
+        st1             {v0.h}[0], [x0], x1
+        b.gt            11b
+0:
+        ret
+
+20:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+24:
+        subs            w4,  w4,  #4
+        b.lt            210f
+        ld1             {v0.4s}, [x2], #16
+        st1             {v0.s}[0], [x0], x1
+        st1             {v0.s}[1], [x7], x1
+        st1             {v0.s}[2], [x0], x1
+        st1             {v0.s}[3], [x7], x1
+        b.le            0f
+        b               24b
+210:
+        add             w4,  w4,  #4
+        asr             x1,  x1,  #1
+22:
+        subs            w4,  w4,  #1
+        ld1             {v0.s}[0], [x2], #4
+        st1             {v0.s}[0], [x0], x1
+        b.gt            22b
+0:
+        ret
+
+30:
+        ldr             w5,  [x2]
+        ldrh            w6,  [x2, #4]
+        add             x2,  x2,  #6
+        subs            w4,  w4,  #1
+        str             w5,  [x0]
+        strh            w6,  [x0, #4]
+        add             x0,  x0,  x1
+        b.gt            30b
+        ret
+
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+42:
+        subs            w4,  w4,  #2
+        b.lt            41f
+        ld1             {v0.2d}, [x2], #16
+        st1             {v0.d}[0], [x0], x1
+        st1             {v0.d}[1], [x7], x1
+        b.le            0f
+        b               42b
+41:
+        ld1             {v0.4h}, [x2]
+        st1             {v0.4h}, [x0]
+0:
+        ret
+
+50:
+        ldr             x5,  [x2]
+        ldrh            w6,  [x2, #8]
+        add             x2,  x2,  #10
+        subs            w4,  w4,  #1
+        str             x5,  [x0]
+        strh            w6,  [x0, #8]
+        add             x0,  x0,  x1
+        b.gt            50b
+        ret
+
+60:
+        ldr             x5,  [x2]
+        ldr             w6,  [x2, #8]
+        add             x2,  x2,  #12
+        subs            w4,  w4,  #1
+        str             x5,  [x0]
+        str             w6,  [x0, #8]
+        add             x0,  x0,  x1
+        b.gt            60b
+        ret
+
+70:
+        ldr             x5,  [x2]
+        ldr             w6,  [x2, #8]
+        ldrh            w7,  [x2, #12]
+        add             x2,  x2,  #14
+        subs            w4,  w4,  #1
+        str             x5,  [x0]
+        str             w6,  [x0, #8]
+        strh            w7,  [x0, #12]
+        add             x0,  x0,  x1
+        b.gt            70b
+        ret
+
+L(copy_narrow_tbl):
+        .hword 0
+        .hword L(copy_narrow_tbl) - 10b
+        .hword L(copy_narrow_tbl) - 20b
+        .hword L(copy_narrow_tbl) - 30b
+        .hword L(copy_narrow_tbl) - 40b
+        .hword L(copy_narrow_tbl) - 50b
+        .hword L(copy_narrow_tbl) - 60b
+        .hword L(copy_narrow_tbl) - 70b
+endfunc
+
+#define SUM_STRIDE (384+16)
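+
+// The sumsq (int32) and sum (int16) buffers use a fixed row stride of
+// SUM_STRIDE elements. The sgr_box3/box5 functions below process two input
+// rows per iteration, writing the second row's results one SUM_STRIDE
+// further on (via x10/x11), and advance both outputs by two rows per loop.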
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+        add             w5,  w5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq
+        add             x11, x1,  #(2*SUM_STRIDE)   // sum
+        add             x12, x3,  x4                // src
+        lsl             x4,  x4,  #1
+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8; without it, align to 4.
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            0f
+        // !LR_HAVE_RIGHT
+        add             w13, w5,  #3
+        bic             w13, w13, #3
+        b               1f
+0:
+        add             w13, w5,  #7
+        bic             w13, w13, #7
+1:
+        sub             x9,  x9,  w13, uxtw #1
+
+        // Store the width for the vertical loop
+        mov             w8,  w5
+
+        // Subtract the number of pixels read per row from the input stride
+        add             w13, w5,  #14
+        bic             w13, w13, #7
+        sub             x4,  x4,  w13, uxtw #1
+
+        // Set up the src pointers to include the left edge, for the case of
+        // LR_HAVE_LEFT with left == NULL
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x2,  0f
+        // left == NULL
+        sub             x3,  x3,  #4
+        sub             x12, x12, #4
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but adjust the stride as if we had done that.
+        add             x4,  x4,  #4
+
+1:      // Loop vertically
+        ld1             {v0.8h, v1.8h},   [x3],  #32
+        ld1             {v16.8h, v17.8h}, [x12], #32
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x2,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v2.d}[1],  [x2], #8
+        // Move x3/x12 back to account for the last 2 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             x3,  x3,  #4
+        sub             x12, x12, #4
+        ld1             {v18.d}[1], [x2], #8
+        ext             v1.16b,  v0.16b,  v1.16b,  #12
+        ext             v0.16b,  v2.16b,  v0.16b,  #12
+        ext             v17.16b, v16.16b, v17.16b, #12
+        ext             v16.16b, v18.16b, v16.16b, #12
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+        // and shift v0/v1 to have 2x the first pixel at the front.
+        dup             v2.8h,  v0.h[0]
+        dup             v18.8h, v16.h[0]
+        // Move x3/x12 back to account for the last 2 pixels we loaded before,
+        // which we shifted out.
+        sub             x3,  x3,  #4
+        sub             x12, x12, #4
+        ext             v1.16b,  v0.16b,  v1.16b,  #12
+        ext             v0.16b,  v2.16b,  v0.16b,  #12
+        ext             v17.16b, v16.16b, v17.16b, #12
+        ext             v16.16b, v18.16b, v16.16b, #12
+
+2:
+        umull           v2.4s,   v0.4h,   v0.4h
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v4.4s,   v1.4h,   v1.4h
+        umull           v18.4s,  v16.4h,  v16.4h
+        umull2          v19.4s,  v16.8h,  v16.8h
+        umull           v20.4s,  v17.4h,  v17.4h
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load the pixel to pad with
+        // here, while it's still easy to find.
+        sub             w13, w5, #(2 + 16 - 2 + 1)
+        ldr             h30, [x3,  w13, sxtw #1]
+        ldr             h31, [x12, w13, sxtw #1]
+        // Fill v30/v31 with the right padding pixel
+        dup             v30.8h,  v30.h[0]
+        dup             v31.8h,  v31.h[0]
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge, we need to quit early here.
+        cmp             w5,  #10
+        b.ge            4f   // If w >= 10, all used input pixels are valid
+        cmp             w5,  #6
+        b.ge            5f   // If w >= 6, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro ext_n            dst1, dst2, src1, src2, src3, n, w
+        ext             \dst1,  \src1,  \src2,  \n
+.if \w > 4
+        ext             \dst2,  \src2,  \src3,  \n
+.endif
+.endm
+.macro add_n            dst1, dst2, src1, src2, src3, src4, w
+        add             \dst1,  \src1,  \src3
+.if \w > 4
+        add             \dst2,  \src2,  \src4
+.endif
+.endm
+
+.macro add3 w, wd
+        ext             v24.16b, v0.16b,  v1.16b,  #2
+        ext             v25.16b, v0.16b,  v1.16b,  #4
+        ext             v26.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v16.16b, v17.16b, #4
+        add             v6\wd,   v0\wd,   v24\wd
+        add             v7\wd,   v16\wd,  v26\wd
+        add             v6\wd,   v6\wd,   v25\wd
+        add             v7\wd,   v7\wd,   v27\wd
+
+        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4, \w
+        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8, \w
+
+        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
+        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
+
+        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
+        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
+
+        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
+        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
+.endm
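+        // These are the same 3-tap box sums as in the 8 bpc version, but
+        // with 16-bit pixels the squares are 32 bits wide from the start, so
+        // each row's running sumsq spans a register pair plus one extra
+        // register (v2/v3 + v4, v18/v19 + v20), and the ext_n/add_n helpers
+        // above operate across those pairs.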
+        add3            8, .8h
+        st1             {v6.8h},         [x1],  #16
+        st1             {v7.8h},         [x11], #16
+        st1             {v22.4s,v23.4s}, [x0],  #32
+        st1             {v24.4s,v25.4s}, [x10], #32
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        mov             v0.16b,  v1.16b
+        mov             v16.16b, v17.16b
+        ld1             {v1.8h},  [x3],  #16
+        ld1             {v17.8h}, [x12], #16
+        mov             v2.16b,  v4.16b
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v4.4s,   v1.4h,   v1.4h
+        mov             v18.16b, v20.16b
+        umull2          v19.4s,  v16.8h,  v16.8h
+        umull           v20.4s,  v17.4h,  v17.4h
+
+        b.ne            4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 6 <= w < 10
+        add3            4, .4h
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+
+        subs            w5,  w5,  #4 // 2 <= w < 6
+        ext             v0.16b,  v0.16b,  v1.16b,  #8
+        ext             v16.16b, v16.16b, v17.16b, #8
+
+6:      // Pad the right edge and produce the last few pixels.
+        // 2 <= w < 6, 2-5 pixels valid in v0/v16
+        sub             w13,  w5,  #2
+        // w13 = (pixels valid - 2)
+        adr             x14, L(box3_variable_shift_tbl)
+        ldrh            w13, [x14, w13, uxtw #1]
+        sub             x13, x14, w13, uxth
+        br              x13
+        // Shift v0/v16 right, shifting out invalid pixels,
+        // then shift them left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #4
+        ext             v16.16b, v16.16b, v16.16b, #4
+        ext             v0.16b,  v0.16b,  v30.16b, #12
+        ext             v16.16b, v16.16b, v31.16b, #12
+        b               88f
+33:     // 3 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #6
+        ext             v16.16b, v16.16b, v16.16b, #6
+        ext             v0.16b,  v0.16b,  v30.16b, #10
+        ext             v16.16b, v16.16b, v31.16b, #10
+        b               88f
+44:     // 4 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #8
+        ext             v16.16b, v16.16b, v16.16b, #8
+        ext             v0.16b,  v0.16b,  v30.16b, #8
+        ext             v16.16b, v16.16b, v31.16b, #8
+        b               88f
+55:     // 5 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #10
+        ext             v16.16b, v16.16b, v16.16b, #10
+        ext             v0.16b,  v0.16b,  v30.16b, #6
+        ext             v16.16b, v16.16b, v31.16b, #6
+        b               88f
+
+L(box3_variable_shift_tbl):
+        .hword L(box3_variable_shift_tbl) - 22b
+        .hword L(box3_variable_shift_tbl) - 33b
+        .hword L(box3_variable_shift_tbl) - 44b
+        .hword L(box3_variable_shift_tbl) - 55b
+
+88:
+        umull           v2.4s,   v0.4h,   v0.4h
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v18.4s,  v16.4h,  v16.4h
+        umull2          v19.4s,  v16.8h,  v16.8h
+
+        add3            4, .4h
+        subs            w5,  w5,  #4
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+        b.le            9f
+        ext             v0.16b,  v0.16b,  v0.16b,  #8
+        ext             v16.16b, v16.16b, v16.16b, #8
+        mov             v2.16b,  v3.16b
+        mov             v3.16b,  v4.16b
+        mov             v18.16b, v19.16b
+        mov             v19.16b, v20.16b
+        // Only one more pixel is needed, but do a normal 4-pixel
+        // addition anyway
+        add3            4, .4h
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x9, lsl #1
+        add             x10, x10, x9, lsl #1
+        add             x1,  x1,  x9
+        add             x11, x11, x9
+        add             x3,  x3,  x4
+        add             x12, x12, x4
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+        add             w5,  w5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq
+        add             x11, x1,  #(2*SUM_STRIDE)   // sum
+        add             x12, x3,  x4                // src
+        lsl             x4,  x4,  #1
+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8; without it, align to 4.
+        // Also subtract the number of pixels read per row from the input stride.
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            0f
+        // !LR_HAVE_RIGHT
+        add             w13, w5,  #3
+        bic             w13, w13, #3
+        add             w14, w5,  #13
+        b               1f
+0:
+        add             w13, w5,  #7
+        bic             w13, w13, #7
+        add             w14, w5,  #15
+1:
+        sub             x9,  x9,  w13, uxtw #1
+        bic             w14, w14, #7
+        sub             x4,  x4,  w14, uxtw #1
+
+        // Store the width for the vertical loop
+        mov             w8,  w5
+
+        // Set up the src pointers to include the left edge; for LR_HAVE_LEFT
+        // with left == NULL, the left edge is read from the src pointer itself.
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            2f
+        // LR_HAVE_LEFT
+        cbnz            x2,  0f
+        // left == NULL
+        sub             x3,  x3,  #6
+        sub             x12, x12, #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             x4,  x4,  #6
+
+1:      // Loop vertically
+        ld1             {v0.8h, v1.8h},   [x3],  #32
+        ld1             {v16.8h, v17.8h}, [x12], #32
+
+        tst             w7,  #1 // LR_HAVE_LEFT
+        b.eq            0f
+        cbz             x2,  2f
+        // LR_HAVE_LEFT, left != NULL
+        ld1             {v2.d}[1],  [x2], #8
+        // Move x3/x12 back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             x3,  x3,  #6
+        sub             x12, x12, #6
+        ld1             {v18.d}[1],  [x2], #8
+        ext             v1.16b,  v0.16b,  v1.16b,  #10
+        ext             v0.16b,  v2.16b,  v0.16b,  #10
+        ext             v17.16b, v16.16b, v17.16b, #10
+        ext             v16.16b, v18.16b, v16.16b, #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+        // and shift v0/v1 to have 3x the first pixel at the front.
+        dup             v2.8h,  v0.h[0]
+        dup             v18.8h, v16.h[0]
+        // Move x3/x12 back to account for the last 3 pixels we loaded before,
+        // which we'll shift out.
+        sub             x3,  x3,  #6
+        sub             x12, x12, #6
+        ext             v1.16b,  v0.16b,  v1.16b,  #10
+        ext             v0.16b,  v2.16b,  v0.16b,  #10
+        ext             v17.16b, v16.16b, v17.16b, #10
+        ext             v16.16b, v18.16b, v16.16b, #10
+
+2:
+        umull           v2.4s,   v0.4h,   v0.4h
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v4.4s,   v1.4h,   v1.4h
+        umull           v18.4s,  v16.4h,  v16.4h
+        umull2          v19.4s,  v16.8h,  v16.8h
+        umull           v20.4s,  v17.4h,  v17.4h
+
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        b.ne            4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             w13, w5, #(2 + 16 - 3 + 1)
+        ldr             h30, [x3,  w13, sxtw #1]
+        ldr             h31, [x12, w13, sxtw #1]
+        // Fill v30/v31 with the right padding pixel
+        dup             v30.8h,  v30.h[0]
+        dup             v31.8h,  v31.h[0]
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             w5,  #11
+        b.ge            4f   // If w >= 11, all used input pixels are valid
+        cmp             w5,  #7
+        b.ge            5f   // If w >= 7, we can produce 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add5 w, wd
+        ext             v24.16b, v0.16b,  v1.16b,  #2
+        ext             v25.16b, v0.16b,  v1.16b,  #4
+        ext             v26.16b, v0.16b,  v1.16b,  #6
+        ext             v27.16b, v0.16b,  v1.16b,  #8
+
+        add             v6\wd,   v0\wd,   v24\wd
+        add             v25\wd,  v25\wd,  v26\wd
+        add             v6\wd,   v6\wd,   v27\wd
+
+        ext             v26.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v16.16b, v17.16b, #4
+        ext             v28.16b, v16.16b, v17.16b, #6
+        ext             v29.16b, v16.16b, v17.16b, #8
+
+        add             v7\wd,   v16\wd,  v26\wd
+        add             v27\wd,  v27\wd,  v28\wd
+        add             v7\wd,   v7\wd,   v29\wd
+        add             v6\wd,   v6\wd,   v25\wd
+        add             v7\wd,   v7\wd,   v27\wd
+
+        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4,  \w
+        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8,  \w
+        ext_n           v28.16b, v29.16b, v2.16b,  v3.16b,  v4.16b,  #12, \w
+
+        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
+        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
+        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v3.4s,   v4.4s,   \w
+        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
+
+        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4,  \w
+        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8,  \w
+        ext_n           v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
+
+        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
+        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
+        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v19.4s,  v20.4s,  \w
+        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
+.endm
+        add5            8, .8h
+        st1             {v6.8h},         [x1],  #16
+        st1             {v7.8h},         [x11], #16
+        st1             {v22.4s,v23.4s}, [x0],  #32
+        st1             {v24.4s,v25.4s}, [x10], #32
+
+        subs            w5,  w5,  #8
+        b.le            9f
+        tst             w7,  #2 // LR_HAVE_RIGHT
+        mov             v0.16b,  v1.16b
+        mov             v16.16b, v17.16b
+        ld1             {v1.8h},  [x3],  #16
+        ld1             {v17.8h}, [x12], #16
+        mov             v2.16b,  v4.16b
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v4.4s,   v1.4h,   v1.4h
+        mov             v18.16b, v20.16b
+        umull2          v19.4s,  v16.8h,  v16.8h
+        umull           v20.4s,  v17.4h,  v17.4h
+
+        b.ne            4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 7 <= w < 11
+        add5            4, .4h
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+
+        subs            w5,  w5,  #4 // 3 <= w < 7
+        ext             v0.16b,  v0.16b,  v1.16b,  #8
+        ext             v16.16b, v16.16b, v17.16b, #8
+
+6:      // Pad the right edge and produce the last few pixels.
+        // w < 7, w+1 pixels valid in v0/v16
+        sub             w13,  w5,  #1
+        // w13 = pixels valid - 2
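+        // Computed goto: each .hword entry in the table below stores the
+        // distance from the table label back to its handler (22b-77b), so
+        // subtracting the loaded entry from the table address gives the
+        // branch target for the br.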
+        adr             x14, L(box5_variable_shift_tbl)
+        ldrh            w13, [x14, w13, uxtw #1]
+        mov             v1.16b,  v30.16b
+        mov             v17.16b, v31.16b
+        sub             x13, x14, w13, uxth
+        br              x13
+        // Shift v0 right, shifting out invalid pixels,
+        // shift v0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #4
+        ext             v16.16b, v16.16b, v16.16b, #4
+        ext             v0.16b,  v0.16b,  v30.16b, #12
+        ext             v16.16b, v16.16b, v31.16b, #12
+        b               88f
+33:     // 3 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #6
+        ext             v16.16b, v16.16b, v16.16b, #6
+        ext             v0.16b,  v0.16b,  v30.16b, #10
+        ext             v16.16b, v16.16b, v31.16b, #10
+        b               88f
+44:     // 4 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #8
+        ext             v16.16b, v16.16b, v16.16b, #8
+        ext             v0.16b,  v0.16b,  v30.16b, #8
+        ext             v16.16b, v16.16b, v31.16b, #8
+        b               88f
+55:     // 5 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #10
+        ext             v16.16b, v16.16b, v16.16b, #10
+        ext             v0.16b,  v0.16b,  v30.16b, #6
+        ext             v16.16b, v16.16b, v31.16b, #6
+        b               88f
+66:     // 6 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #12
+        ext             v16.16b, v16.16b, v16.16b, #12
+        ext             v0.16b,  v0.16b,  v30.16b, #4
+        ext             v16.16b, v16.16b, v31.16b, #4
+        b               88f
+77:     // 7 pixels valid
+        ext             v0.16b,  v0.16b,  v0.16b,  #14
+        ext             v16.16b, v16.16b, v16.16b, #14
+        ext             v0.16b,  v0.16b,  v30.16b, #2
+        ext             v16.16b, v16.16b, v31.16b, #2
+        b               88f
+
+L(box5_variable_shift_tbl):
+        .hword L(box5_variable_shift_tbl) - 22b
+        .hword L(box5_variable_shift_tbl) - 33b
+        .hword L(box5_variable_shift_tbl) - 44b
+        .hword L(box5_variable_shift_tbl) - 55b
+        .hword L(box5_variable_shift_tbl) - 66b
+        .hword L(box5_variable_shift_tbl) - 77b
+
+88:
+        umull           v2.4s,   v0.4h,   v0.4h
+        umull2          v3.4s,   v0.8h,   v0.8h
+        umull           v4.4s,   v1.4h,   v1.4h
+        umull           v18.4s,  v16.4h,  v16.4h
+        umull2          v19.4s,  v16.8h,  v16.8h
+        umull           v20.4s,  v17.4h,  v17.4h
+
+        add5            4, .4h
+        subs            w5,  w5,  #4
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+        b.le            9f
+        ext             v0.16b,  v0.16b,  v1.16b,  #8
+        ext             v16.16b, v16.16b, v17.16b, #8
+        mov             v2.16b,  v3.16b
+        mov             v3.16b,  v4.16b
+        mov             v18.16b, v19.16b
+        mov             v19.16b, v20.16b
+        add5            4, .4h
+        st1             {v6.4h},  [x1],  #8
+        st1             {v7.4h},  [x11], #8
+        st1             {v22.4s}, [x0],  #16
+        st1             {v24.4s}, [x10], #16
+
+9:
+        subs            w6,  w6,  #2
+        b.le            0f
+        // Jump to the next row and loop horizontally
+        add             x0,  x0,  x9, lsl #1
+        add             x10, x10, x9, lsl #1
+        add             x1,  x1,  x9
+        add             x11, x11, x9
+        add             x3,  x3,  x4
+        add             x12, x12, x4
+        mov             w5,  w8
+        b               1b
+0:
+        ret
+.purgem add5
+endfunc
+
+sgr_funcs 16
diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000..200eb63
--- /dev/null
+++ b/src/arm/64/looprestoration_common.S
@@ -0,0 +1,432 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
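+// In scalar terms, a sketch of the vertical pass (see the add3 macro below):
+//   sumsq[y][x] = sumsq_in[y-1][x] + sumsq_in[y][x] + sumsq_in[y+1][x]
+//   sum[y][x]   = sum_in[y-1][x]   + sum_in[y][x]   + sum_in[y+1][x]
+// with the first/last row repeated when LR_HAVE_TOP/LR_HAVE_BOTTOM is unset.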
+function sgr_box3_v_neon, export=1
+        add             w10, w3,  #2 // Number of output rows to move back
+        mov             w11, w3      // Number of input rows to move back
+        add             w2,  w2,  #2 // Actual summed width
+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             x8,       #(2*SUM_STRIDE) // sum stride
+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             w4,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        // If have top, read from row -2.
+        sub             x5,  x0,  #(4*SUM_STRIDE)
+        sub             x6,  x1,  #(2*SUM_STRIDE)
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             x5,  x0,  #(4*SUM_STRIDE)
+        add             x6,  x1,  #(2*SUM_STRIDE)
+1:
+
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.eq            1f
+        // LR_HAVE_BOTTOM
+        add             w3,  w3,  #2  // Sum all h+2 lines with the main loop
+        add             w11, w11, #2
+1:
+        mov             w9,  w3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into v16-v21 and v24-v26 taking top
+        // padding into consideration.
+        tst             w4,  #4 // LR_HAVE_TOP
+        ld1             {v16.4s, v17.4s}, [x5], x7
+        ld1             {v24.8h},         [x6], x8
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v18.4s, v19.4s}, [x5], x7
+        ld1             {v25.8h},         [x6], x8
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v25.16b, v24.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v26.16b, v24.16b
+
+3:
+        subs            w3,  w3,  #1
+.macro add3
+        add             v16.4s,  v16.4s,  v18.4s
+        add             v17.4s,  v17.4s,  v19.4s
+        add             v24.8h,  v24.8h,  v25.8h
+        add             v16.4s,  v16.4s,  v20.4s
+        add             v17.4s,  v17.4s,  v21.4s
+        add             v24.8h,  v24.8h,  v26.8h
+        st1             {v16.4s, v17.4s}, [x0], x7
+        st1             {v24.8h},         [x1], x8
+.endm
+        add3
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v24.16b, v25.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v25.16b, v26.16b
+        b.le            4f
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b               3b
+
+4:
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.ne            5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v24.16b, v25.16b
+        add3
+
+5:      // End of one vertical slice.
+        subs            w2,  w2,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        msub            x5,  x7,  x11, x5
+        msub            x6,  x8,  x11, x6
+        // Output pointers
+        msub            x0,  x7,  x10, x0
+        msub            x1,  x8,  x10, x1
+        add             x0,  x0,  #32
+        add             x1,  x1,  #16
+        add             x5,  x5,  #32
+        add             x6,  x6,  #16
+        mov             w3,  w9
+        b               1b
+
+0:
+        ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
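+// As for box3_v, but summing five rows per output row (see the add5 macro
+// below), for both the sumsq and sum planes:
+//   out[y][x] = in[y-2][x] + in[y-1][x] + in[y][x] + in[y+1][x] + in[y+2][x]
+// with edge rows repeated as needed per LR_HAVE_TOP/LR_HAVE_BOTTOM.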
+function sgr_box5_v_neon, export=1
+        add             w10, w3,  #2 // Number of output rows to move back
+        mov             w11, w3      // Number of input rows to move back
+        add             w2,  w2,  #8 // Actual summed width
+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             x8,       #(2*SUM_STRIDE) // sum stride
+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             w4,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        // If have top, read from row -2.
+        sub             x5,  x0,  #(4*SUM_STRIDE)
+        sub             x6,  x1,  #(2*SUM_STRIDE)
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             x5,  x0,  #(4*SUM_STRIDE)
+        add             x6,  x1,  #(2*SUM_STRIDE)
+1:
+
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.eq            0f
+        // LR_HAVE_BOTTOM
+        add             w3,  w3,  #2  // Handle h+2 lines with the main loop
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             w3,  w3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             w9,  w3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into v16-v25 and v26-v30 taking top
+        // padding into consideration.
+        tst             w4,  #4 // LR_HAVE_TOP
+        ld1             {v16.4s, v17.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v28.8h},         [x6], x8
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v27.16b, v26.16b
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v27.16b, v26.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v28.16b, v26.16b
+        mov             v22.16b, v16.16b
+        mov             v23.16b, v17.16b
+        mov             v29.16b, v26.16b
+
+3:
+        cbz             w3,  4f
+        ld1             {v24.4s, v25.4s}, [x5], x7
+        ld1             {v30.8h},         [x6], x8
+
+3:
+        // Start of vertical loop
+        subs            w3,  w3,  #2
+.macro add5
+        add             v16.4s,  v16.4s,  v18.4s
+        add             v17.4s,  v17.4s,  v19.4s
+        add             v26.8h,  v26.8h,  v27.8h
+        add             v0.4s,   v20.4s,  v22.4s
+        add             v1.4s,   v21.4s,  v23.4s
+        add             v2.8h,   v28.8h,  v29.8h
+        add             v16.4s,  v16.4s,  v24.4s
+        add             v17.4s,  v17.4s,  v25.4s
+        add             v26.8h,  v26.8h,  v30.8h
+        add             v16.4s,  v16.4s,  v0.4s
+        add             v17.4s,  v17.4s,  v1.4s
+        add             v26.8h,  v26.8h,  v2.8h
+        st1             {v16.4s, v17.4s}, [x0], x7
+        st1             {v26.8h},         [x1], x8
+.endm
+        add5
+.macro shift2
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v26.16b, v28.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v27.16b, v29.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v28.16b, v30.16b
+.endm
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        b.le            5f
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        ld1             {v24.4s, v25.4s}, [x5], x7
+        ld1             {v30.8h},         [x6], x8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        mov             v24.16b, v22.16b
+        mov             v25.16b, v23.16b
+        mov             v30.16b, v29.16b
+        add5
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        add5
+        b               6f
+
+5:
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.ne            6f
+        // !LR_HAVE_BOTTOM
+        cbnz            w3,  5f
+        // Three edge rows remain; output the one at h-2 and
+        // the past-edge one at h.
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        // Pad the past-edge row from the last content row.
+        mov             v24.16b, v22.16b
+        mov             v25.16b, v23.16b
+        mov             v30.16b, v29.16b
+        add5
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // w3 == -1, two rows left, output one.
+        // Pad the last two rows from the mid one.
+        mov             v22.16b, v20.16b
+        mov             v23.16b, v21.16b
+        mov             v29.16b, v28.16b
+        mov             v24.16b, v20.16b
+        mov             v25.16b, v21.16b
+        mov             v30.16b, v28.16b
+        add5
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            w2,  w2,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        msub            x5,  x7,  x11, x5
+        msub            x6,  x8,  x11, x6
+        // Output pointers
+        msub            x0,  x7,  x10, x0
+        msub            x1,  x8,  x10, x1
+        add             x0,  x0,  #32
+        add             x1,  x1,  #16
+        add             x5,  x5,  #32
+        add             x6,  x6,  #16
+        mov             w3,  w9
+        b               1b
+
+0:
+        ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength,
+//                              const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength,
+//                              const int bitdepth_max);
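+// Per element, a sketch assembled from the inline comments below; n is the
+// box area (9 or 25), s the strength, one_by_x the 455/164 constant, and
+// aa/bb are hypothetical names for the rescaled intermediates:
+//   aa = rnd_rshift(a[i], 2*bitdepth_min_8)
+//   bb = rnd_rshift(b[i], bitdepth_min_8)
+//   p  = max(aa*n - bb*bb, 0)
+//   z  = min((p*s + (1 << 19)) >> 20, 255)
+//   x  = sgr_x_by_x[z]
+//   a[i] = (x * b[i] * one_by_x + (1 << 11)) >> 12
+//   b[i] = 256 - x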
+function sgr_calc_ab1_neon, export=1
+        clz             w9,  w5
+        add             x3,  x3,  #2 // h += 2
+        movi            v31.4s,   #9 // n
+        mov             x5,  #455
+        mov             x8,  #SUM_STRIDE
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        clz             w9,  w5
+        add             x3,  x3,  #3  // h += 3
+        asr             x3,  x3,  #1  // h /= 2
+        movi            v31.4s,   #25 // n
+        mov             x5,  #164
+        mov             x8,  #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+        sub             w9,  w9,  #24  // -bitdepth_min_8
+        movrel          x12, X(sgr_x_by_x)
+        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
+        dup             v6.8h,    w9   // -bitdepth_min_8
+        movi            v19.16b,  #5
+        movi            v20.8b,   #55  // idx of last 5
+        movi            v21.8b,   #72  // idx of last 4
+        movi            v22.8b,   #101 // idx of last 3
+        movi            v23.8b,   #169 // idx of last 2
+        movi            v24.8b,   #254 // idx of last 1
+        saddl           v7.4s,    v6.4h,   v6.4h  // -2*bitdepth_min_8
+        add             x2,  x2,  #2 // w += 2
+        add             x7,  x2,  #7
+        bic             x7,  x7,  #7 // aligned w
+        sub             x7,  x8,  x7 // increment between rows
+        movi            v29.8h,   #1, lsl #8
+        dup             v28.4s,   w4
+        dup             v30.4s,   w5 // one_by_x
+        sub             x0,  x0,  #(4*(SUM_STRIDE))
+        sub             x1,  x1,  #(2*(SUM_STRIDE))
+        mov             x6,  x2   // backup of w
+        sub             v16.16b, v16.16b, v19.16b
+        sub             v17.16b, v17.16b, v19.16b
+        sub             v18.16b, v18.16b, v19.16b
+1:
+        subs            x2,  x2,  #8
+        ld1             {v0.4s, v1.4s}, [x0]   // a
+        ld1             {v2.8h}, [x1]          // b
+        srshl           v0.4s,  v0.4s,  v7.4s
+        srshl           v1.4s,  v1.4s,  v7.4s
+        srshl           v4.8h,  v2.8h,  v6.8h
+        mul             v0.4s,  v0.4s,  v31.4s // a * n
+        mul             v1.4s,  v1.4s,  v31.4s // a * n
+        umull           v3.4s,  v4.4h,  v4.4h  // b * b
+        umull2          v4.4s,  v4.8h,  v4.8h  // b * b
+        uqsub           v0.4s,  v0.4s,  v3.4s  // imax(a * n - b * b, 0)
+        uqsub           v1.4s,  v1.4s,  v4.4s  // imax(a * n - b * b, 0)
+        mul             v0.4s,  v0.4s,  v28.4s // p * s
+        mul             v1.4s,  v1.4s,  v28.4s // p * s
+        uqshrn          v0.4h,  v0.4s,  #16
+        uqshrn2         v0.8h,  v1.4s,  #16
+        uqrshrn         v0.8b,  v0.8h,  #4     // imin(z, 255)
+
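+        // The tbl below only covers sgr_x_by_x[0..47] (stored minus 5 via
+        // v19); for larger indices it yields 0 and the value is rebuilt as
+        // 5 minus the number of "idx of last N" thresholds exceeded, with
+        // each cmhi mask contributing -1 when its threshold is exceeded.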
+        cmhi            v25.8b, v0.8b,  v20.8b // = -1 if sgr_x_by_x[v0] < 5
+        cmhi            v26.8b, v0.8b,  v21.8b // = -1 if sgr_x_by_x[v0] < 4
+        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+        cmhi            v27.8b, v0.8b,  v22.8b // = -1 if sgr_x_by_x[v0] < 3
+        cmhi            v4.8b,  v0.8b,  v23.8b // = -1 if sgr_x_by_x[v0] < 2
+        add             v25.8b, v25.8b, v26.8b
+        cmhi            v5.8b,  v0.8b,  v24.8b // = -1 if sgr_x_by_x[v0] < 1
+        add             v27.8b, v27.8b, v4.8b
+        add             v5.8b,  v5.8b,  v19.8b
+        add             v25.8b, v25.8b, v27.8b
+        add             v1.8b,  v1.8b,  v5.8b
+        add             v1.8b,  v1.8b,  v25.8b
+        uxtl            v1.8h,  v1.8b          // x
+
+        umull           v3.4s,  v1.4h,  v2.4h  // x * BB[i]
+        umull2          v4.4s,  v1.8h,  v2.8h  // x * BB[i]
+        mul             v3.4s,  v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v4.4s,  v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        srshr           v3.4s,  v3.4s,  #12    // AA[i]
+        srshr           v4.4s,  v4.4s,  #12    // AA[i]
+        sub             v2.8h,  v29.8h, v1.8h  // 256 - x
+
+        st1             {v3.4s, v4.4s}, [x0], #32
+        st1             {v2.8h}, [x1], #16
+        b.gt            1b
+
+        subs            x3,  x3,  #1
+        b.le            0f
+        add             x0,  x0,  x7, lsl #2
+        add             x1,  x1,  x7, lsl #1
+        mov             x2,  x6
+        b               1b
+0:
+        ret
+endfunc
diff --git a/src/arm/64/looprestoration_tmpl.S b/src/arm/64/looprestoration_tmpl.S
new file mode 100644 (file)
index 0000000..520365b
--- /dev/null
@@ -0,0 +1,597 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
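+// Per output pixel, a sketch matching the "* 3 -> a"/"* 3 -> b" comments
+// below (A built from the 16-bit b[] plane, B from the 32-bit a[] plane):
+//   A = 4*(b[x] + b[x-1] + b[x+1] + b[x-stride] + b[x+stride])
+//     + 3*(the four diagonal b[] neighbours)
+//   B = the same 4/3 weighting applied to a[]
+//   tmp[x] = (B + A*src[x] + (1 << 8)) >> 9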
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+        sub             x7,  x3,  #(4*SUM_STRIDE)
+        add             x8,  x3,  #(4*SUM_STRIDE)
+        sub             x9,  x4,  #(2*SUM_STRIDE)
+        add             x10, x4,  #(2*SUM_STRIDE)
+        mov             x11, #SUM_STRIDE
+        mov             x12, #FILTER_OUT_STRIDE
+        add             x13, x5,  #7
+        bic             x13, x13, #7 // Aligned width
+.if \bpc == 8
+        sub             x2,  x2,  x13
+.else
+        sub             x2,  x2,  x13, lsl #1
+.endif
+        sub             x12, x12, x13
+        sub             x11, x11, x13
+        sub             x11, x11, #4 // We read 4 extra elements from a
+        sub             x14, x11, #4 // We read 8 extra elements from b
+        mov             x13, x5
+        movi            v6.8h,  #3
+        movi            v7.4s,  #3
+1:
+        ld1             {v0.8h, v1.8h}, [x9], #32
+        ld1             {v2.8h, v3.8h}, [x4], #32
+        ld1             {v4.8h, v5.8h}, [x10], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
+        ld1             {v19.4s, v20.4s, v21.4s}, [x3], #48
+        ld1             {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+        subs            x5,  x5,  #8
+        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
+        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
+        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
+        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
+        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
+        ext             v30.16b, v4.16b,  v5.16b, #4  // +1+stride
+        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
+        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
+        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
+        add             v2.8h,   v2.8h,   v26.8h
+        add             v4.8h,   v4.8h,   v30.8h      // -1+stride, +1+stride
+        add             v2.8h,   v2.8h,   v29.8h      // +1
+        add             v0.8h,   v0.8h,   v4.8h
+
+        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
+        ext             v26.16b, v17.16b, v18.16b, #4
+        shl             v2.8h,   v2.8h,   #2
+        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
+        ext             v28.16b, v17.16b, v18.16b, #8
+        ext             v29.16b, v19.16b, v20.16b, #4 // 0
+        ext             v30.16b, v20.16b, v21.16b, #4
+        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
+        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
+        add             v26.4s,  v26.4s,  v20.4s
+        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
+        add             v17.4s,  v17.4s,  v28.4s
+        ext             v27.16b, v19.16b, v20.16b, #8 // +1
+        ext             v28.16b, v20.16b, v21.16b, #8
+        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
+        add             v17.4s,  v17.4s,  v23.4s
+        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
+        add             v30.4s,  v30.4s,  v28.4s
+        add             v25.4s,  v25.4s,  v29.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
+        ext             v28.16b, v23.16b, v24.16b, #4
+        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
+        ext             v30.16b, v23.16b, v24.16b, #8
+.if \bpc == 8
+        ld1             {v19.8b}, [x1], #8            // src
+.else
+        ld1             {v19.8h}, [x1], #16           // src
+.endif
+        add             v25.4s,  v25.4s,  v27.4s      // +stride
+        add             v26.4s,  v26.4s,  v28.4s
+        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
+        add             v17.4s,  v17.4s,  v30.4s
+        shl             v25.4s,  v25.4s,  #2
+        shl             v26.4s,  v26.4s,  #2
+        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
+        mla             v26.4s,  v17.4s,  v7.4s
+.if \bpc == 8
+        uxtl            v19.8h,  v19.8b               // src
+.endif
+        mov             v0.16b,  v1.16b
+        umlal           v25.4s,  v2.4h,   v19.4h      // b + a * src
+        umlal2          v26.4s,  v2.8h,   v19.8h
+        mov             v2.16b,  v3.16b
+        rshrn           v25.4h,  v25.4s,  #9
+        rshrn2          v25.8h,  v26.4s,  #9
+        mov             v4.16b,  v5.16b
+        st1             {v25.8h}, [x0], #16
+
+        b.le            3f
+        mov             v16.16b, v18.16b
+        mov             v19.16b, v21.16b
+        mov             v22.16b, v24.16b
+        ld1             {v1.8h}, [x9], #16
+        ld1             {v3.8h}, [x4], #16
+        ld1             {v5.8h}, [x10], #16
+        ld1             {v17.4s, v18.4s}, [x7], #32
+        ld1             {v20.4s, v21.4s}, [x3], #32
+        ld1             {v23.4s, v24.4s}, [x8], #32
+        b               2b
+
+3:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x13
+        add             x0,  x0,  x12, lsl #1
+        add             x1,  x1,  x2
+        add             x3,  x3,  x11, lsl #2
+        add             x7,  x7,  x11, lsl #2
+        add             x8,  x8,  x11, lsl #2
+        add             x4,  x4,  x14, lsl #1
+        add             x9,  x9,  x14, lsl #1
+        add             x10, x10, x14, lsl #1
+        b               1b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
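+// A sketch of the two row phases below: even (full) rows compute
+//   A = 6*(b[x-stride] + b[x+stride]) + 5*(the four diagonal b[] neighbours)
+//   B = the same 6/5 weighting applied to the 32-bit a[] plane
+//   tmp[x] = (B + A*src[x] + (1 << 8)) >> 9
+// while odd rows use a horizontal-only A = 6*b[x] + 5*(b[x-1] + b[x+1])
+// (and likewise for B), with a >> 8 at the end instead.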
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+        add             x7,  x3,  #(4*(SUM_STRIDE))
+        sub             x3,  x3,  #(4*(SUM_STRIDE))
+        add             x8,  x4,  #(2*(SUM_STRIDE))
+        sub             x4,  x4,  #(2*(SUM_STRIDE))
+        mov             x9,  #(2*SUM_STRIDE)
+        mov             x10, #FILTER_OUT_STRIDE
+        add             x11, x5,  #7
+        bic             x11, x11, #7 // Aligned width
+.if \bpc == 8
+        sub             x2,  x2,  x11
+.else
+        sub             x2,  x2,  x11, lsl #1
+.endif
+        sub             x10, x10, x11
+        sub             x9,  x9,  x11
+        sub             x9,  x9,  #4 // We read 4 extra elements from a
+        sub             x12, x9,  #4 // We read 8 extra elements from b
+        mov             x11, x5
+        movi            v4.8h,  #5
+        movi            v5.4s,  #5
+        movi            v6.8h,  #6
+        movi            v7.4s,  #6
+1:
+        ld1             {v0.8h, v1.8h}, [x4], #32
+        ld1             {v2.8h, v3.8h}, [x8], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
+        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+        subs            x5,  x5,  #8
+        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
+        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
+        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
+        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
+        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
+        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
+        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
+        add             v0.8h,   v0.8h,   v25.8h
+
+        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
+        ext             v23.16b, v17.16b, v18.16b, #4
+        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
+        ext             v25.16b, v20.16b, v21.16b, #4
+        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
+        ext             v27.16b, v17.16b, v18.16b, #8
+        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
+        ext             v29.16b, v20.16b, v21.16b, #8
+        mul             v0.8h,   v0.8h,   v4.8h       // * 5
+        mla             v0.8h,   v2.8h,   v6.8h       // * 6
+.if \bpc == 8
+        ld1             {v31.8b}, [x1], #8
+.else
+        ld1             {v31.8h}, [x1], #16
+.endif
+        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
+        add             v17.4s,  v17.4s,  v27.4s
+        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
+        add             v20.4s,  v20.4s,  v29.4s
+        add             v16.4s,  v16.4s,  v19.4s
+        add             v17.4s,  v17.4s,  v20.4s
+
+        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
+        add             v23.4s,  v23.4s,  v25.4s
+        // This is, surprisingly, faster than other variants where the
+        // mul+mla pairs are further apart, on Cortex A53.
+        mul             v16.4s,  v16.4s,  v5.4s       // * 5
+        mla             v16.4s,  v22.4s,  v7.4s       // * 6
+        mul             v17.4s,  v17.4s,  v5.4s       // * 5
+        mla             v17.4s,  v23.4s,  v7.4s       // * 6
+
+.if \bpc == 8
+        uxtl            v31.8h,  v31.8b
+.endif
+        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src
+        umlal2          v17.4s,  v0.8h,   v31.8h
+        mov             v0.16b,  v1.16b
+        rshrn           v16.4h,  v16.4s,  #9
+        rshrn2          v16.8h,  v17.4s,  #9
+        mov             v2.16b,  v3.16b
+        st1             {v16.8h}, [x0], #16
+
+        b.le            3f
+        mov             v16.16b, v18.16b
+        mov             v19.16b, v21.16b
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v3.8h}, [x8], #16
+        ld1             {v17.4s, v18.4s}, [x3], #32
+        ld1             {v20.4s, v21.4s}, [x7], #32
+        b               2b
+
+3:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x11
+        add             x0,  x0,  x10, lsl #1
+        add             x1,  x1,  x2
+        add             x3,  x3,  x9, lsl #2
+        add             x7,  x7,  x9, lsl #2
+        add             x4,  x4,  x12, lsl #1
+        add             x8,  x8,  x12, lsl #1
+        mov             x13, x3
+        mov             x14, x4
+
+        ld1             {v0.8h, v1.8h}, [x4], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
+
+4:
+        subs            x5,  x5,  #8
+        ext             v23.16b, v0.16b,  v1.16b, #4  // +1
+        ext             v22.16b, v0.16b,  v1.16b, #2  // 0
+        add             v0.8h,   v0.8h,   v23.8h      // -1, +1
+
+        ext             v24.16b, v16.16b, v17.16b, #4 // 0
+        ext             v25.16b, v17.16b, v18.16b, #4
+        ext             v26.16b, v16.16b, v17.16b, #8 // +1
+        ext             v27.16b, v17.16b, v18.16b, #8
+        mul             v2.8h,   v22.8h,  v6.8h       // * 6
+        mla             v2.8h,   v0.8h,   v4.8h       // * 5 -> a
+.if \bpc == 8
+        ld1             {v31.8b}, [x1], #8
+.else
+        ld1             {v31.8h}, [x1], #16
+.endif
+        add             v16.4s,  v16.4s,  v26.4s      // -1, +1
+        add             v17.4s,  v17.4s,  v27.4s
+.if \bpc == 8
+        uxtl            v31.8h,  v31.8b
+.endif
+        // This is, surprisingly, faster than other variants where the
+        // mul+mla pairs are further apart, on Cortex A53.
+        mul             v24.4s,  v24.4s,  v7.4s       // * 6
+        mla             v24.4s,  v16.4s,  v5.4s       // * 5 -> b
+        mul             v25.4s,  v25.4s,  v7.4s       // * 6
+        mla             v25.4s,  v17.4s,  v5.4s       // * 5 -> b
+
+        umlal           v24.4s,  v2.4h,   v31.4h      // b + a * src
+        umlal2          v25.4s,  v2.8h,   v31.8h
+        mov             v0.16b,  v1.16b
+        rshrn           v24.4h,  v24.4s,  #8
+        rshrn2          v24.8h,  v25.4s,  #8
+        mov             v16.16b, v18.16b
+        st1             {v24.8h}, [x0], #16
+
+        b.le            5f
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v17.4s, v18.4s}, [x3], #32
+        b               4b
+
+5:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x11
+        add             x0,  x0,  x10, lsl #1
+        add             x1,  x1,  x2
+        mov             x3,  x13 // Rewind x3/x4 to where they started
+        mov             x4,  x14
+        b               1b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int w, const int h,
+//                                    const int wt, const int bitdepth_max);
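+// Per pixel, a sketch of the blend below, with u the source sample scaled
+// to the intermediate range and wt the single weight:
+//   u      = src[x] << 4
+//   v      = (u << 7) + wt*(t1[x] - u)
+//   dst[x] = clip_pixel((v + (1 << 10)) >> 11)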
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+        ldr             w8,  [sp]
+.endif
+        dup             v31.8h, w7
+        cmp             x6,  #2
+.if \bpc == 16
+        dup             v30.8h, w8
+.endif
+        add             x9,  x0,  x1
+        add             x10, x2,  x3
+        add             x11, x4,  #2*FILTER_OUT_STRIDE
+        mov             x7,  #(4*FILTER_OUT_STRIDE)
+        lsl             x1,  x1,  #1
+        lsl             x3,  x3,  #1
+        add             x8,  x5,  #7
+        bic             x8,  x8,  #7 // Aligned width
+.if \bpc == 8
+        sub             x1,  x1,  x8
+        sub             x3,  x3,  x8
+.else
+        sub             x1,  x1,  x8, lsl #1
+        sub             x3,  x3,  x8, lsl #1
+.endif
+        sub             x7,  x7,  x8, lsl #1
+        mov             x8,  x5
+        b.lt            2f
+1:
+.if \bpc == 8
+        ld1             {v0.8b}, [x2],  #8
+        ld1             {v4.8b}, [x10], #8
+.else
+        ld1             {v0.8h}, [x2],  #16
+        ld1             {v4.8h}, [x10], #16
+.endif
+        ld1             {v1.8h}, [x4],  #16
+        ld1             {v5.8h}, [x11], #16
+        subs            x5,  x5,  #8
+.if \bpc == 8
+        ushll           v0.8h,  v0.8b,  #4     // u
+        ushll           v4.8h,  v4.8b,  #4     // u
+.else
+        shl             v0.8h,  v0.8h,  #4     // u
+        shl             v4.8h,  v4.8h,  #4     // u
+.endif
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v5.8h,  v5.8h,  v4.8h  // t1 - u
+        ushll           v2.4s,  v0.4h,  #7     // u << 7
+        ushll2          v3.4s,  v0.8h,  #7     // u << 7
+        ushll           v6.4s,  v4.4h,  #7     // u << 7
+        ushll2          v7.4s,  v4.8h,  #7     // u << 7
+        smlal           v2.4s,  v1.4h,  v31.4h // v
+        smlal2          v3.4s,  v1.8h,  v31.8h // v
+        smlal           v6.4s,  v5.4h,  v31.4h // v
+        smlal2          v7.4s,  v5.8h,  v31.8h // v
+.if \bpc == 8
+        rshrn           v2.4h,  v2.4s,  #11
+        rshrn2          v2.8h,  v3.4s,  #11
+        rshrn           v6.4h,  v6.4s,  #11
+        rshrn2          v6.8h,  v7.4s,  #11
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v6.8b,  v6.8h
+        st1             {v2.8b}, [x0], #8
+        st1             {v6.8b}, [x9], #8
+.else
+        sqrshrun        v2.4h,  v2.4s,  #11
+        sqrshrun2       v2.8h,  v3.4s,  #11
+        sqrshrun        v6.4h,  v6.4s,  #11
+        sqrshrun2       v6.8h,  v7.4s,  #11
+        umin            v2.8h,  v2.8h,  v30.8h
+        umin            v6.8h,  v6.8h,  v30.8h
+        st1             {v2.8h}, [x0], #16
+        st1             {v6.8h}, [x9], #16
+.endif
+        b.gt            1b
+
+        sub             x6,  x6,  #2
+        cmp             x6,  #1
+        b.lt            0f
+        mov             x5,  x8
+        add             x0,  x0,  x1
+        add             x9,  x9,  x1
+        add             x2,  x2,  x3
+        add             x10, x10, x3
+        add             x4,  x4,  x7
+        add             x11, x11, x7
+        b.eq            2f
+        b               1b
+
+2:
+.if \bpc == 8
+        ld1             {v0.8b}, [x2], #8
+.else
+        ld1             {v0.8h}, [x2], #16
+.endif
+        ld1             {v1.8h}, [x4], #16
+        subs            x5,  x5,  #8
+.if \bpc == 8
+        ushll           v0.8h,  v0.8b,  #4     // u
+.else
+        shl             v0.8h,  v0.8h,  #4     // u
+.endif
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        ushll           v2.4s,  v0.4h,  #7     // u << 7
+        ushll2          v3.4s,  v0.8h,  #7     // u << 7
+        smlal           v2.4s,  v1.4h,  v31.4h // v
+        smlal2          v3.4s,  v1.8h,  v31.8h // v
+.if \bpc == 8
+        rshrn           v2.4h,  v2.4s,  #11
+        rshrn2          v2.8h,  v3.4s,  #11
+        sqxtun          v2.8b,  v2.8h
+        st1             {v2.8b}, [x0], #8
+.else
+        sqrshrun        v2.4h,  v2.4s,  #11
+        sqrshrun2       v2.8h,  v3.4s,  #11
+        umin            v2.8h,  v2.8h,  v30.8h
+        st1             {v2.8h}, [x0], #16
+.endif
+        b.gt            2b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int16_t *t2,
+//                                    const int w, const int h,
+//                                    const int16_t wt[2]);
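+// As sgr_weighted1, but blending two filtered planes with a weight pair:
+//   u      = src[x] << 4
+//   v      = (u << 7) + wt[0]*(t1[x] - u) + wt[1]*(t2[x] - u)
+//   dst[x] = clip_pixel((v + (1 << 10)) >> 11)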
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+.if \bpc == 8
+        ldr             x8,  [sp]
+.else
+        ldp             x8,  x9,  [sp]
+.endif
+        cmp             x7,  #2
+        add             x10, x0,  x1
+        add             x11, x2,  x3
+        add             x12, x4,  #2*FILTER_OUT_STRIDE
+        add             x13, x5,  #2*FILTER_OUT_STRIDE
+        ld2r            {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+.if \bpc == 16
+        dup             v29.8h,  w9
+.endif
+        mov             x8,  #4*FILTER_OUT_STRIDE
+        lsl             x1,  x1,  #1
+        lsl             x3,  x3,  #1
+        add             x9,  x6,  #7
+        bic             x9,  x9,  #7 // Aligned width
+.if \bpc == 8
+        sub             x1,  x1,  x9
+        sub             x3,  x3,  x9
+.else
+        sub             x1,  x1,  x9, lsl #1
+        sub             x3,  x3,  x9, lsl #1
+.endif
+        sub             x8,  x8,  x9, lsl #1
+        mov             x9,  x6
+        b.lt            2f
+1:
+.if \bpc == 8
+        ld1             {v0.8b},  [x2],  #8
+        ld1             {v16.8b}, [x11], #8
+.else
+        ld1             {v0.8h},  [x2],  #16
+        ld1             {v16.8h}, [x11], #16
+.endif
+        ld1             {v1.8h},  [x4],  #16
+        ld1             {v17.8h}, [x12], #16
+        ld1             {v2.8h},  [x5],  #16
+        ld1             {v18.8h}, [x13], #16
+        subs            x6,  x6,  #8
+.if \bpc == 8
+        ushll           v0.8h,  v0.8b,  #4     // u
+        ushll           v16.8h, v16.8b, #4     // u
+.else
+        shl             v0.8h,  v0.8h,  #4     // u
+        shl             v16.8h, v16.8h, #4     // u
+.endif
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
+        sub             v17.8h, v17.8h, v16.8h // t1 - u
+        sub             v18.8h, v18.8h, v16.8h // t2 - u
+        ushll           v3.4s,  v0.4h,  #7     // u << 7
+        ushll2          v4.4s,  v0.8h,  #7     // u << 7
+        ushll           v19.4s, v16.4h, #7     // u << 7
+        ushll2          v20.4s, v16.8h, #7     // u << 7
+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
+        smlal           v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+        smlal           v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+        smlal2          v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+        rshrn           v3.4h,  v3.4s,  #11
+        rshrn2          v3.8h,  v4.4s,  #11
+        rshrn           v19.4h, v19.4s, #11
+        rshrn2          v19.8h, v20.4s, #11
+        sqxtun          v3.8b,  v3.8h
+        sqxtun          v19.8b, v19.8h
+        st1             {v3.8b},  [x0],  #8
+        st1             {v19.8b}, [x10], #8
+.else
+        sqrshrun        v3.4h,  v3.4s,  #11
+        sqrshrun2       v3.8h,  v4.4s,  #11
+        sqrshrun        v19.4h, v19.4s, #11
+        sqrshrun2       v19.8h, v20.4s, #11
+        umin            v3.8h,  v3.8h,  v29.8h
+        umin            v19.8h, v19.8h, v29.8h
+        st1             {v3.8h},  [x0],  #16
+        st1             {v19.8h}, [x10], #16
+.endif
+        b.gt            1b
+
+        subs            x7,  x7,  #2
+        cmp             x7,  #1
+        b.lt            0f
+        mov             x6,  x9
+        add             x0,  x0,  x1
+        add             x10, x10, x1
+        add             x2,  x2,  x3
+        add             x11, x11, x3
+        add             x4,  x4,  x8
+        add             x12, x12, x8
+        add             x5,  x5,  x8
+        add             x13, x13, x8
+        b.eq            2f
+        b               1b
+
+2:
+.if \bpc == 8
+        ld1             {v0.8b}, [x2], #8
+.else
+        ld1             {v0.8h}, [x2], #16
+.endif
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v2.8h}, [x5], #16
+        subs            x6,  x6,  #8
+.if \bpc == 8
+        ushll           v0.8h,  v0.8b,  #4     // u
+.else
+        shl             v0.8h,  v0.8h,  #4     // u
+.endif
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
+        ushll           v3.4s,  v0.4h,  #7     // u << 7
+        ushll2          v4.4s,  v0.8h,  #7     // u << 7
+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+        rshrn           v3.4h,  v3.4s,  #11
+        rshrn2          v3.8h,  v4.4s,  #11
+        sqxtun          v3.8b,  v3.8h
+        st1             {v3.8b}, [x0], #8
+.else
+        sqrshrun        v3.4h,  v3.4s,  #11
+        sqrshrun2       v3.8h,  v4.4s,  #11
+        umin            v3.8h,  v3.8h,  v29.8h
+        st1             {v3.8h}, [x0], #16
+.endif
+        b.gt            2b
+0:
+        ret
+endfunc
+.endm
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
new file mode 100644
index 0000000..f6970de
--- /dev/null
+++ b/src/arm/64/mc.S
@@ -0,0 +1,3249 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
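+// In scalar terms, avg computes, per pixel (saturated to the pixel range):
+//   dst[x] = (tmp1[x] + tmp2[x] + 16) >> 5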
+.macro avg dst, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        add             \t0\().8h,   \t0\().8h,   \t2\().8h
+        add             \t1\().8h,   \t1\().8h,   \t3\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #5
+        sqrshrun2       \dst\().16b, \t1\().8h,   #5
+.endm
+
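+// w_avg blends with one weight; in scalar terms, roughly
+//   dst[x] = (tmp1[x]*w + tmp2[x]*(16 - w) + 128) >> 8
+// computed as tmp2 + (w*(tmp1 - tmp2) >> 4) by a sqdmulh against
+// v30 = -(w << 11), so the exact rounding is that of sqdmulh/sqrshrun.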
+.macro w_avg dst, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
+        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
+        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
+        add             \t0\().8h,   \t2\().8h,   \t0\().8h
+        add             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+        sqrshrun2       \dst\().16b, \t1\().8h,   #4
+.endm
+
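+// mask is the same blend with a per-pixel weight m[x] in [0, 64]; roughly
+//   dst[x] = (tmp1[x]*m[x] + tmp2[x]*(64 - m[x]) + 512) >> 10
+// where the mul by #256-2 and the shll turn each mask byte into -(m << 9)
+// for the sqdmulh, analogous to w_avg above.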
+.macro mask dst, t0, t1, t2, t3
+        ld1             {v30.16b}, [x6],  16
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        mul             v30.16b, v30.16b, v31.16b
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        shll            v28.8h, v30.8b,  #8
+        shll2           v29.8h, v30.16b, #8
+        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
+        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
+        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
+        add             \t0\().8h,   \t2\().8h,   \t0\().8h
+        add             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+        sqrshrun2       \dst\().16b, \t1\().8h,   #4
+.endm
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+        clz             w4,  w4
+.ifc \type, w_avg
+        dup             v30.8h, w6
+        neg             v30.8h, v30.8h
+        shl             v30.8h, v30.8h, #11
+.endif
+.ifc \type, mask
+        movi            v31.16b, #256-2
+.endif
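+        // Dispatch on the block width: w4 holds clz(w), and clz(w)-24
+        // indexes the .hword offset table at the end of the function,
+        // largest width first.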
+        adr             x7,  L(\type\()_tbl)
+        sub             w4,  w4,  #24
+        ldrh            w4,  [x7, x4, lsl #1]
+        \type           v4,  v0,  v1,  v2,  v3
+        sub             x7,  x7,  w4, uxtw
+        br              x7
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+4:
+        cmp             w5,  #4
+        st1             {v4.s}[0],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
+        st1             {v4.s}[2],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
+        b.eq            0f
+        \type           v5,  v0,  v1,  v2,  v3
+        cmp             w5,  #8
+        st1             {v5.s}[0],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
+        st1             {v5.s}[2],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
+        b.eq            0f
+        \type           v4,  v0,  v1,  v2,  v3
+        st1             {v4.s}[0],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.s}[2],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
+        st1             {v5.s}[0],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
+        st1             {v5.s}[2],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
+        ret
+80:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+8:
+        st1             {v4.d}[0],  [x0], x1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.d}[1],  [x7], x1
+        st1             {v5.d}[0],  [x0], x1
+        subs            w5,  w5,  #4
+        st1             {v5.d}[1],  [x7], x1
+        b.le            0f
+        \type           v4,  v0,  v1,  v2,  v3
+        b               8b
+16:
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.16b}, [x0], x1
+        \type           v6,  v0,  v1,  v2,  v3
+        st1             {v5.16b}, [x0], x1
+        \type           v7,  v0,  v1,  v2,  v3
+        st1             {v6.16b}, [x0], x1
+        subs            w5,  w5,  #4
+        st1             {v7.16b}, [x0], x1
+        b.le            0f
+        \type           v4,  v0,  v1,  v2,  v3
+        b               16b
+320:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+32:
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        st1             {v4.16b,v5.16b}, [x0], x1
+        \type           v7,  v0,  v1,  v2,  v3
+        subs            w5,  w5,  #2
+        st1             {v6.16b,v7.16b}, [x7], x1
+        b.le            0f
+        \type           v4,  v0,  v1,  v2,  v3
+        b               32b
+640:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+64:
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
+        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #2
+        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+        b.le            0f
+        \type           v4, v0,  v1,  v2,  v3
+        b               64b
+1280:
+        add             x7,  x0,  #64
+128:
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
+        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1
+        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+        b.le            0f
+        \type           v4, v0,  v1,  v2,  v3
+        b               128b
+0:
+        ret
+L(\type\()_tbl):
+        .hword L(\type\()_tbl) - 1280b
+        .hword L(\type\()_tbl) -  640b
+        .hword L(\type\()_tbl) -  320b
+        .hword L(\type\()_tbl) -   16b
+        .hword L(\type\()_tbl) -   80b
+        .hword L(\type\()_tbl) -   40b
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
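+// w_mask blends the two intermediate sources like mask above, but derives
+// the 6-bit mask from the absolute difference of the sources and writes it
+// out through x6: at full resolution for 444, horizontally halved for 422
+// and halved in both directions for 420 (the subsampled variants also fold
+// in the sign flag passed in w7).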
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+        clz             w8,  w4
+        adr             x9,  L(w_mask_\type\()_tbl)
+        sub             w8,  w8,  #24
+        ldrh            w8,  [x9,  x8,  lsl #1]
+        sub             x9,  x9,  w8,  uxtw
+        mov             w10, #6903
+        dup             v0.8h,   w10
+.if \type == 444
+        movi            v1.16b,  #64
+.elseif \type == 422
+        dup             v2.8b,   w7
+        movi            v3.8b,   #129
+        sub             v3.8b,   v3.8b,   v2.8b
+.elseif \type == 420
+        dup             v2.8h,   w7
+        movi            v3.8h,   #1, lsl #8
+        sub             v3.8h,   v3.8h,   v2.8h
+.endif
+        add             x12,  x0,  x1
+        lsl             x1,   x1,  #1
+        br              x9
+4:
+        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
+        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
+        subs            w5,  w5,  #4
+        sub             v16.8h,  v6.8h,   v4.8h
+        sub             v17.8h,  v7.8h,   v5.8h
+        sabd            v18.8h,  v4.8h,   v6.8h
+        sabd            v19.8h,  v5.8h,   v7.8h
+        uqsub           v18.8h,  v0.8h,   v18.8h
+        uqsub           v19.8h,  v0.8h,   v19.8h
+        ushr            v18.8h,  v18.8h,  #8
+        ushr            v19.8h,  v19.8h,  #8
+        shl             v20.8h,  v18.8h,  #9
+        shl             v21.8h,  v19.8h,  #9
+        sqdmulh         v20.8h,  v20.8h,  v16.8h
+        sqdmulh         v21.8h,  v21.8h,  v17.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        sqrshrun        v22.8b,  v20.8h,  #4
+        sqrshrun        v23.8b,  v21.8h,  #4
+.if \type == 444
+        xtn             v18.8b,   v18.8h
+        xtn2            v18.16b,  v19.8h
+        sub             v18.16b,  v1.16b,  v18.16b
+        st1             {v18.16b}, [x6],  #16
+.elseif \type == 422
+        addp            v18.8h,   v18.8h,  v19.8h
+        xtn             v18.8b,   v18.8h
+        uhsub           v18.8b,   v3.8b,   v18.8b
+        st1             {v18.8b},  [x6],  #8
+.elseif \type == 420
+        trn1            v24.2d,   v18.2d,  v19.2d
+        trn2            v25.2d,   v18.2d,  v19.2d
+        add             v24.8h,   v24.8h,  v25.8h
+        addp            v18.8h,   v24.8h,  v24.8h
+        sub             v18.4h,   v3.4h,   v18.4h
+        rshrn           v18.8b,   v18.8h,  #2
+        st1             {v18.s}[0],  [x6],  #4
+.endif
+        st1             {v22.s}[0],  [x0],  x1
+        st1             {v22.s}[1],  [x12], x1
+        st1             {v23.s}[0],  [x0],  x1
+        st1             {v23.s}[1],  [x12], x1
+        b.gt            4b
+        ret
+8:
+        ld1             {v4.8h,   v5.8h},   [x2],  #32
+        ld1             {v6.8h,   v7.8h},   [x3],  #32
+        subs            w5,  w5,  #2
+        sub             v16.8h,  v6.8h,   v4.8h
+        sub             v17.8h,  v7.8h,   v5.8h
+        sabd            v18.8h,  v4.8h,   v6.8h
+        sabd            v19.8h,  v5.8h,   v7.8h
+        uqsub           v18.8h,  v0.8h,   v18.8h
+        uqsub           v19.8h,  v0.8h,   v19.8h
+        ushr            v18.8h,  v18.8h,  #8
+        ushr            v19.8h,  v19.8h,  #8
+        shl             v20.8h,  v18.8h,  #9
+        shl             v21.8h,  v19.8h,  #9
+        sqdmulh         v20.8h,  v20.8h,  v16.8h
+        sqdmulh         v21.8h,  v21.8h,  v17.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        sqrshrun        v22.8b,  v20.8h,  #4
+        sqrshrun        v23.8b,  v21.8h,  #4
+.if \type == 444
+        xtn             v18.8b,  v18.8h
+        xtn2            v18.16b, v19.8h
+        sub             v18.16b, v1.16b,  v18.16b
+        st1             {v18.16b}, [x6],  #16
+.elseif \type == 422
+        addp            v18.8h,  v18.8h,  v19.8h
+        xtn             v18.8b,  v18.8h
+        uhsub           v18.8b,  v3.8b,   v18.8b
+        st1             {v18.8b},  [x6],  #8
+.elseif \type == 420
+        add             v18.8h,  v18.8h,  v19.8h
+        addp            v18.8h,  v18.8h,  v18.8h
+        sub             v18.4h,  v3.4h,   v18.4h
+        rshrn           v18.8b,  v18.8h,  #2
+        st1             {v18.s}[0],  [x6],  #4
+.endif
+        st1             {v22.8b},  [x0],  x1
+        st1             {v23.8b},  [x12], x1
+        b.gt            8b
+        ret
+1280:
+640:
+320:
+160:
+        mov             w11, w4
+        sub             x1,  x1,  w4,  uxtw
+.if \type == 444
+        add             x10, x6,  w4,  uxtw
+.elseif \type == 422
+        add             x10, x6,  x11, lsr #1
+.endif
+        add             x9,  x3,  w4,  uxtw #1
+        add             x7,  x2,  w4,  uxtw #1
+161:
+        mov             w8,  w4
+16:
+        ld1             {v4.8h,   v5.8h},   [x2],  #32
+        ld1             {v6.8h,   v7.8h},   [x3],  #32
+        ld1             {v16.8h,  v17.8h},  [x7],  #32
+        ld1             {v18.8h,  v19.8h},  [x9],  #32
+        subs            w8,  w8,  #16
+        sub             v6.8h,   v6.8h,   v4.8h
+        sub             v7.8h,   v7.8h,   v5.8h
+        sub             v18.8h,  v18.8h,  v16.8h
+        sub             v19.8h,  v19.8h,  v17.8h
+        abs             v20.8h,  v6.8h
+        abs             v21.8h,  v7.8h
+        abs             v22.8h,  v18.8h
+        abs             v23.8h,  v19.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        uqsub           v22.8h,  v0.8h,   v22.8h
+        uqsub           v23.8h,  v0.8h,   v23.8h
+        ushr            v20.8h,  v20.8h,  #8
+        ushr            v21.8h,  v21.8h,  #8
+        ushr            v22.8h,  v22.8h,  #8
+        ushr            v23.8h,  v23.8h,  #8
+        shl             v24.8h,  v20.8h,  #9
+        shl             v25.8h,  v21.8h,  #9
+        shl             v26.8h,  v22.8h,  #9
+        shl             v27.8h,  v23.8h,  #9
+        sqdmulh         v24.8h,  v24.8h,  v6.8h
+        sqdmulh         v25.8h,  v25.8h,  v7.8h
+        sqdmulh         v26.8h,  v26.8h,  v18.8h
+        sqdmulh         v27.8h,  v27.8h,  v19.8h
+        add             v24.8h,  v24.8h,  v4.8h
+        add             v25.8h,  v25.8h,  v5.8h
+        add             v26.8h,  v26.8h,  v16.8h
+        add             v27.8h,  v27.8h,  v17.8h
+        sqrshrun        v24.8b,  v24.8h,  #4
+        sqrshrun        v25.8b,  v25.8h,  #4
+        sqrshrun        v26.8b,  v26.8h,  #4
+        sqrshrun        v27.8b,  v27.8h,  #4
+.if \type == 444
+        xtn             v20.8b,  v20.8h
+        xtn2            v20.16b, v21.8h
+        xtn             v21.8b,  v22.8h
+        xtn2            v21.16b, v23.8h
+        sub             v20.16b, v1.16b,  v20.16b
+        sub             v21.16b, v1.16b,  v21.16b
+        st1             {v20.16b}, [x6],  #16
+        st1             {v21.16b}, [x10], #16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v21.8h,  v22.8h,  v23.8h
+        xtn             v20.8b,  v20.8h
+        xtn             v21.8b,  v21.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b
+        uhsub           v21.8b,  v3.8b,   v21.8b
+        st1             {v20.8b},  [x6],  #8
+        st1             {v21.8b},  [x10], #8
+.elseif \type == 420
+        add             v20.8h,  v20.8h,  v22.8h
+        add             v21.8h,  v21.8h,  v23.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        sub             v20.8h,  v3.8h,   v20.8h
+        rshrn           v20.8b,  v20.8h,  #2
+        st1             {v20.8b},  [x6],  #8
+.endif
+        st1             {v24.8b,  v25.8b},  [x0],  #16
+        st1             {v26.8b,  v27.8b},  [x12], #16
+        b.gt            16b
+        subs            w5,  w5,  #2
+        add             x2,  x2,  w4,  uxtw #1
+        add             x3,  x3,  w4,  uxtw #1
+        add             x7,  x7,  w4,  uxtw #1
+        add             x9,  x9,  w4,  uxtw #1
+.if \type == 444
+        add             x6,  x6,  w4,  uxtw
+        add             x10, x10, w4,  uxtw
+.elseif \type == 422
+        add             x6,  x6,  x11, lsr #1
+        add             x10, x10, x11, lsr #1
+.endif
+        add             x0,  x0,  x1
+        add             x12, x12, x1
+        b.gt            161b
+        ret
+L(w_mask_\type\()_tbl):
+        .hword L(w_mask_\type\()_tbl) - 1280b
+        .hword L(w_mask_\type\()_tbl) -  640b
+        .hword L(w_mask_\type\()_tbl) -  320b
+        .hword L(w_mask_\type\()_tbl) -  160b
+        .hword L(w_mask_\type\()_tbl) -    8b
+        .hword L(w_mask_\type\()_tbl) -    4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
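+// blend: dst = (tmp*mask + dst*(64 - mask) + 32) >> 6, with per-pixel
+// 6-bit mask weights read from x5.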
+function blend_8bpc_neon, export=1
+        adr             x6,  L(blend_tbl)
+        clz             w3,  w3
+        sub             w3,  w3,  #26
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw
+        movi            v4.16b,  #64
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x6
+4:
+        ld1             {v2.8b},     [x5],  #8
+        ld1             {v1.d}[0],   [x2],  #8
+        ld1             {v0.s}[0],   [x0]
+        subs            w4,  w4,  #2
+        ld1             {v0.s}[1],   [x8]
+        sub             v3.8b,   v4.8b,   v2.8b
+        umull           v5.8h,   v1.8b,   v2.8b
+        umlal           v5.8h,   v0.8b,   v3.8b
+        rshrn           v6.8b,   v5.8h,   #6
+        st1             {v6.s}[0],   [x0],  x1
+        st1             {v6.s}[1],   [x8],  x1
+        b.gt            4b
+        ret
+8:
+        ld1             {v2.16b},  [x5],  #16
+        ld1             {v1.16b},  [x2],  #16
+        ld1             {v0.d}[0],   [x0]
+        ld1             {v0.d}[1],   [x8]
+        sub             v3.16b,  v4.16b,  v2.16b
+        subs            w4,  w4,  #2
+        umull           v5.8h,   v1.8b,   v2.8b
+        umlal           v5.8h,   v0.8b,   v3.8b
+        umull2          v6.8h,   v1.16b,  v2.16b
+        umlal2          v6.8h,   v0.16b,  v3.16b
+        rshrn           v7.8b,   v5.8h,   #6
+        rshrn2          v7.16b,  v6.8h,   #6
+        st1             {v7.d}[0],   [x0],  x1
+        st1             {v7.d}[1],   [x8],  x1
+        b.gt            8b
+        ret
+16:
+        ld1             {v1.16b,  v2.16b},  [x5],  #32
+        ld1             {v5.16b,  v6.16b},  [x2],  #32
+        ld1             {v0.16b},  [x0]
+        subs            w4,  w4,  #2
+        sub             v7.16b,  v4.16b,  v1.16b
+        sub             v20.16b, v4.16b,  v2.16b
+        ld1             {v3.16b},  [x8]
+        umull           v16.8h,  v5.8b,   v1.8b
+        umlal           v16.8h,  v0.8b,   v7.8b
+        umull2          v17.8h,  v5.16b,  v1.16b
+        umlal2          v17.8h,  v0.16b,  v7.16b
+        umull           v21.8h,  v6.8b,   v2.8b
+        umlal           v21.8h,  v3.8b,   v20.8b
+        umull2          v22.8h,  v6.16b,  v2.16b
+        umlal2          v22.8h,  v3.16b,  v20.16b
+        rshrn           v18.8b,  v16.8h,  #6
+        rshrn2          v18.16b, v17.8h,  #6
+        rshrn           v19.8b,  v21.8h,  #6
+        rshrn2          v19.16b, v22.8h,  #6
+        st1             {v18.16b}, [x0],  x1
+        st1             {v19.16b}, [x8],  x1
+        b.gt            16b
+        ret
+32:
+        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
+        ld1             {v20.16b, v21.16b}, [x0]
+        subs            w4,  w4,  #2
+        ld1             {v22.16b, v23.16b}, [x8]
+        sub             v5.16b,  v4.16b,  v0.16b
+        sub             v6.16b,  v4.16b,  v1.16b
+        sub             v30.16b, v4.16b,  v2.16b
+        sub             v31.16b, v4.16b,  v3.16b
+        umull           v24.8h,  v16.8b,  v0.8b
+        umlal           v24.8h,  v20.8b,  v5.8b
+        umull2          v26.8h,  v16.16b, v0.16b
+        umlal2          v26.8h,  v20.16b, v5.16b
+        umull           v28.8h,  v17.8b,  v1.8b
+        umlal           v28.8h,  v21.8b,  v6.8b
+        umull2          v7.8h,   v17.16b, v1.16b
+        umlal2          v7.8h,   v21.16b, v6.16b
+        umull           v27.8h,  v18.8b,  v2.8b
+        umlal           v27.8h,  v22.8b,  v30.8b
+        umull2          v1.8h,   v18.16b, v2.16b
+        umlal2          v1.8h,   v22.16b, v30.16b
+        umull           v29.8h,  v19.8b,  v3.8b
+        umlal           v29.8h,  v23.8b,  v31.8b
+        umull2          v21.8h,  v19.16b, v3.16b
+        umlal2          v21.8h,  v23.16b, v31.16b
+        rshrn           v24.8b,  v24.8h,  #6
+        rshrn2          v24.16b, v26.8h,  #6
+        rshrn           v25.8b,  v28.8h,  #6
+        rshrn2          v25.16b, v7.8h,   #6
+        rshrn           v27.8b,  v27.8h,  #6
+        rshrn2          v27.16b, v1.8h,   #6
+        rshrn           v28.8b,  v29.8h,  #6
+        rshrn2          v28.16b, v21.8h,  #6
+        st1             {v24.16b, v25.16b}, [x0],  x1
+        st1             {v27.16b, v28.16b}, [x8],  x1
+        b.gt            32b
+        ret
+L(blend_tbl):
+        .hword L(blend_tbl) - 32b
+        .hword L(blend_tbl) - 16b
+        .hword L(blend_tbl) -  8b
+        .hword L(blend_tbl) -  4b
+endfunc
+
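+// blend_h: OBMC blend along the top edge, with one obmc_masks weight per
+// row; the bottom quarter of the rows is left untouched (the row count is
+// reduced to h - h/4 up front).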
+function blend_h_8bpc_neon, export=1
+        adr             x6,  L(blend_h_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w4,  uxtw
+        sub             w4,  w4,  w4,  lsr #2
+        clz             w7,  w3
+        movi            v4.16b,  #64
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        sub             w7,  w7,  #24
+        ldrh            w7,  [x6,  x7,  lsl #1]
+        sub             x6,  x6,  w7, uxtw
+        br              x6
+2:
+        ld1             {v0.h}[0],   [x5],  #2
+        ld1             {v1.s}[0],   [x2],  #4
+        subs            w4,  w4,  #2
+        ld1             {v2.h}[0],   [x0]
+        zip1            v0.8b,   v0.8b,   v0.8b
+        sub             v3.8b,   v4.8b,   v0.8b
+        ld1             {v2.h}[1],   [x8]
+        umull           v5.8h,   v1.8b,   v0.8b
+        umlal           v5.8h,   v2.8b,   v3.8b
+        rshrn           v5.8b,   v5.8h,   #6
+        st1             {v5.h}[0],   [x0],  x1
+        st1             {v5.h}[1],   [x8],  x1
+        b.gt            2b
+        ret
+4:
+        ld2r            {v0.8b,   v1.8b},   [x5],  #2
+        ld1             {v2.8b},   [x2],  #8
+        subs            w4,  w4,  #2
+        ext             v0.8b,   v0.8b,   v1.8b,   #4
+        ld1             {v3.s}[0],   [x0]
+        sub             v5.8b,   v4.8b,   v0.8b
+        ld1             {v3.s}[1],   [x8]
+        umull           v6.8h,   v2.8b,   v0.8b
+        umlal           v6.8h,   v3.8b,   v5.8b
+        rshrn           v6.8b,   v6.8h,   #6
+        st1             {v6.s}[0],   [x0],  x1
+        st1             {v6.s}[1],   [x8],  x1
+        b.gt            4b
+        ret
+8:
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2
+        ld1             {v2.16b},  [x2],  #16
+        ld1             {v3.d}[0],   [x0]
+        ext             v0.16b,  v0.16b,  v1.16b,  #8
+        sub             v5.16b,  v4.16b,  v0.16b
+        ld1             {v3.d}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v6.8h,   v0.8b,   v2.8b
+        umlal           v6.8h,   v3.8b,   v5.8b
+        umull2          v7.8h,   v0.16b,  v2.16b
+        umlal2          v7.8h,   v3.16b,  v5.16b
+        rshrn           v16.8b,  v6.8h,   #6
+        rshrn2          v16.16b, v7.8h,   #6
+        st1             {v16.d}[0],  [x0],  x1
+        st1             {v16.d}[1],  [x8],  x1
+        b.gt            8b
+        ret
+16:
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2
+        ld1             {v2.16b,  v3.16b},  [x2],  #32
+        ld1             {v5.16b},  [x0]
+        sub             v7.16b,  v4.16b,  v0.16b
+        sub             v16.16b, v4.16b,  v1.16b
+        ld1             {v6.16b},  [x8]
+        subs            w4,  w4,  #2
+        umull           v17.8h,  v0.8b,   v2.8b
+        umlal           v17.8h,  v5.8b,   v7.8b
+        umull2          v18.8h,  v0.16b,  v2.16b
+        umlal2          v18.8h,  v5.16b,  v7.16b
+        umull           v19.8h,  v1.8b,   v3.8b
+        umlal           v19.8h,  v6.8b,   v16.8b
+        umull2          v20.8h,  v1.16b,  v3.16b
+        umlal2          v20.8h,  v6.16b,  v16.16b
+        rshrn           v21.8b,  v17.8h,  #6
+        rshrn2          v21.16b, v18.8h,  #6
+        rshrn           v22.8b,  v19.8h,  #6
+        rshrn2          v22.16b, v20.8h,  #6
+        st1             {v21.16b}, [x0],  x1
+        st1             {v22.16b}, [x8],  x1
+        b.gt            16b
+        ret
+1280:
+640:
+320:
+        sub             x1,  x1,  w3,  uxtw
+        add             x7,  x2,  w3,  uxtw
+321:
+        ld2r            {v0.16b,  v1.16b},  [x5],  #2
+        mov             w6,  w3
+        sub             v20.16b, v4.16b,  v0.16b
+        sub             v21.16b, v4.16b,  v1.16b
+32:
+        ld1             {v16.16b, v17.16b}, [x2],  #32
+        ld1             {v2.16b,  v3.16b},  [x0]
+        subs            w6,  w6,  #32
+        umull           v23.8h,  v0.8b,   v16.8b
+        umlal           v23.8h,  v2.8b,   v20.8b
+        ld1             {v18.16b, v19.16b}, [x7],  #32
+        umull2          v27.8h,  v0.16b,  v16.16b
+        umlal2          v27.8h,  v2.16b,  v20.16b
+        ld1             {v6.16b,  v7.16b},  [x8]
+        umull           v24.8h,  v0.8b,   v17.8b
+        umlal           v24.8h,  v3.8b,   v20.8b
+        umull2          v28.8h,  v0.16b,  v17.16b
+        umlal2          v28.8h,  v3.16b,  v20.16b
+        umull           v25.8h,  v1.8b,   v18.8b
+        umlal           v25.8h,  v6.8b,   v21.8b
+        umull2          v5.8h,   v1.16b,  v18.16b
+        umlal2          v5.8h,   v6.16b,  v21.16b
+        rshrn           v29.8b,  v23.8h,  #6
+        rshrn2          v29.16b, v27.8h,  #6
+        umull           v26.8h,  v1.8b,   v19.8b
+        umlal           v26.8h,  v7.8b,   v21.8b
+        umull2          v31.8h,  v1.16b,  v19.16b
+        umlal2          v31.8h,  v7.16b,  v21.16b
+        rshrn           v30.8b,  v24.8h,  #6
+        rshrn2          v30.16b, v28.8h,  #6
+        rshrn           v23.8b,  v25.8h,  #6
+        rshrn2          v23.16b, v5.8h,   #6
+        rshrn           v24.8b,  v26.8h,  #6
+        st1             {v29.16b, v30.16b}, [x0],  #32
+        rshrn2          v24.16b, v31.8h,  #6
+        st1             {v23.16b, v24.16b}, [x8],  #32
+        b.gt            32b
+        subs            w4,  w4,  #2
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        add             x2,  x2,  w3,  uxtw
+        add             x7,  x7,  w3,  uxtw
+        b.gt            321b
+        ret
+L(blend_h_tbl):
+        .hword L(blend_h_tbl) - 1280b
+        .hword L(blend_h_tbl) -  640b
+        .hword L(blend_h_tbl) -  320b
+        .hword L(blend_h_tbl) -   16b
+        .hword L(blend_h_tbl) -    8b
+        .hword L(blend_h_tbl) -    4b
+        .hword L(blend_h_tbl) -    2b
+endfunc
+
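+// blend_v: OBMC blend along the left edge, with one obmc_masks weight per
+// column; only the leftmost (w*3)/4 columns of each row are written back.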
+function blend_v_8bpc_neon, export=1
+        adr             x6,  L(blend_v_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w3,  uxtw
+        clz             w3,  w3
+        movi            v4.16b,  #64
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        sub             w3,  w3,  #26
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw
+        br              x6
+20:
+        ld1r            {v0.8b},   [x5]
+        sub             v1.8b,   v4.8b,   v0.8b
+2:
+        ld1             {v2.h}[0],   [x2],  #2
+        ld1             {v3.b}[0],   [x0]
+        subs            w4,  w4,  #2
+        ld1             {v2.b}[1],   [x2]
+        ld1             {v3.b}[1],   [x8]
+        umull           v5.8h,   v2.8b,   v0.8b
+        umlal           v5.8h,   v3.8b,   v1.8b
+        rshrn           v5.8b,   v5.8h,   #6
+        add             x2,  x2,  #2
+        st1             {v5.b}[0],   [x0],  x1
+        st1             {v5.b}[1],   [x8],  x1
+        b.gt            2b
+        ret
+40:
+        ld1r            {v0.2s},   [x5]
+        sub             x1,  x1,  #2
+        sub             v1.8b,   v4.8b,   v0.8b
+4:
+        ld1             {v2.8b},   [x2],  #8
+        ld1             {v3.s}[0],   [x0]
+        ld1             {v3.s}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v5.8h,   v2.8b,   v0.8b
+        umlal           v5.8h,   v3.8b,   v1.8b
+        rshrn           v5.8b,   v5.8h,   #6
+        st1             {v5.h}[0],   [x0],  #2
+        st1             {v5.h}[2],   [x8],  #2
+        st1             {v5.b}[2],   [x0],  x1
+        st1             {v5.b}[6],   [x8],  x1
+        b.gt            4b
+        ret
+80:
+        ld1r            {v0.2d},   [x5]
+        sub             x1,  x1,  #4
+        sub             v1.16b,  v4.16b,  v0.16b
+8:
+        ld1             {v2.16b},  [x2],  #16
+        ld1             {v3.d}[0],   [x0]
+        ld1             {v3.d}[1],   [x8]
+        subs            w4,  w4,  #2
+        umull           v5.8h,  v0.8b,  v2.8b
+        umlal           v5.8h,  v3.8b,  v1.8b
+        umull2          v6.8h,  v0.16b, v2.16b
+        umlal2          v6.8h,  v3.16b, v1.16b
+        rshrn           v7.8b,  v5.8h,  #6
+        rshrn2          v7.16b, v6.8h,  #6
+        st1             {v7.s}[0],   [x0],  #4
+        st1             {v7.s}[2],   [x8],  #4
+        st1             {v7.h}[2],   [x0],  x1
+        st1             {v7.h}[6],   [x8],  x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.16b},  [x5]
+        sub             x1,  x1,  #8
+        sub             v2.16b,  v4.16b,  v0.16b
+16:
+        ld1             {v5.16b,  v6.16b},  [x2],  #32
+        ld1             {v7.16b},  [x0]
+        subs            w4,  w4,  #2
+        ld1             {v16.16b}, [x8]
+        umull           v17.8h,  v5.8b,   v0.8b
+        umlal           v17.8h,  v7.8b,   v2.8b
+        umull2          v18.8h,  v5.16b,  v0.16b
+        umlal2          v18.8h,  v7.16b,  v2.16b
+        umull           v20.8h,  v6.8b,   v0.8b
+        umlal           v20.8h,  v16.8b,  v2.8b
+        umull2          v21.8h,  v6.16b,  v0.16b
+        umlal2          v21.8h,  v16.16b, v2.16b
+        rshrn           v19.8b,  v17.8h,  #6
+        rshrn2          v19.16b, v18.8h,  #6
+        rshrn           v22.8b,  v20.8h,  #6
+        rshrn2          v22.16b, v21.8h,  #6
+        st1             {v19.8b},  [x0],  #8
+        st1             {v22.8b},  [x8],  #8
+        st1             {v19.s}[2],  [x0],  x1
+        st1             {v22.s}[2],  [x8],  x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.16b,  v1.16b},  [x5]
+        sub             x1,  x1,  #16
+        sub             v2.16b,  v4.16b,  v0.16b
+        sub             v3.8b,   v4.8b,   v1.8b
+32:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
+        ld1             {v5.16b,  v6.16b},  [x0]
+        subs            w4,  w4,  #2
+        ld1             {v20.16b, v21.16b}, [x8]
+        umull           v22.8h,  v16.8b,  v0.8b
+        umlal           v22.8h,  v5.8b,   v2.8b
+        umull2          v23.8h,  v16.16b, v0.16b
+        umlal2          v23.8h,  v5.16b,  v2.16b
+        umull           v28.8h,  v17.8b,  v1.8b
+        umlal           v28.8h,  v6.8b,   v3.8b
+        umull           v30.8h,  v18.8b,  v0.8b
+        umlal           v30.8h,  v20.8b,  v2.8b
+        umull2          v31.8h,  v18.16b, v0.16b
+        umlal2          v31.8h,  v20.16b, v2.16b
+        umull           v25.8h,  v19.8b,  v1.8b
+        umlal           v25.8h,  v21.8b,  v3.8b
+        rshrn           v24.8b,  v22.8h,  #6
+        rshrn2          v24.16b, v23.8h,  #6
+        rshrn           v28.8b,  v28.8h,  #6
+        rshrn           v30.8b,  v30.8h,  #6
+        rshrn2          v30.16b, v31.8h,  #6
+        rshrn           v27.8b,  v25.8h,  #6
+        st1             {v24.16b}, [x0],  #16
+        st1             {v30.16b}, [x8],  #16
+        st1             {v28.8b},  [x0],  x1
+        st1             {v27.8b},  [x8],  x1
+        b.gt            32b
+        ret
+L(blend_v_tbl):
+        .hword L(blend_v_tbl) - 320b
+        .hword L(blend_v_tbl) - 160b
+        .hword L(blend_v_tbl) -  80b
+        .hword L(blend_v_tbl) -  40b
+        .hword L(blend_v_tbl) -  20b
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x8 is set to (clz(w)-24).
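+// put_neon itself is a plain pixel copy, dispatched on log2(width).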
+function put_neon
+        adr             x9,  L(put_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+2:
+        ld1             {v0.h}[0], [x2], x3
+        ld1             {v1.h}[0], [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.h}[0], [x0], x1
+        st1             {v1.h}[0], [x0], x1
+        b.gt            2b
+        ret
+4:
+        ld1             {v0.s}[0], [x2], x3
+        ld1             {v1.s}[0], [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        b.gt            4b
+        ret
+8:
+        ld1             {v0.8b}, [x2], x3
+        ld1             {v1.8b}, [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.8b}, [x0], x1
+        st1             {v1.8b}, [x0], x1
+        b.gt            8b
+        ret
+160:
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        add             x9,  x2,  x3
+        lsl             x3,  x3,  #1
+16:
+        ld1             {v0.16b}, [x2], x3
+        ld1             {v1.16b}, [x9], x3
+        subs            w5,  w5,  #2
+        st1             {v0.16b}, [x0], x1
+        st1             {v1.16b}, [x8], x1
+        b.gt            16b
+        ret
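+        // Widths of 32 and up copy whole rows with ldp/stp pairs,
+        // through GP registers for 32/64 and q registers for 128.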
+32:
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        subs            w5,  w5,  #1
+        stp             x8,  x9,  [x0, #16]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            32b
+        ret
+64:
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        ldp             x10, x11, [x2, #32]
+        stp             x8,  x9,  [x0, #16]
+        subs            w5,  w5,  #1
+        ldp             x12, x13, [x2, #48]
+        stp             x10, x11, [x0, #32]
+        stp             x12, x13, [x0, #48]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            64b
+        ret
+128:
+        ldp             q0,  q1,  [x2]
+        ldp             q2,  q3,  [x2, #32]
+        stp             q0,  q1,  [x0]
+        ldp             q4,  q5,  [x2, #64]
+        stp             q2,  q3,  [x0, #32]
+        ldp             q6,  q7,  [x2, #96]
+        subs            w5,  w5,  #1
+        stp             q4,  q5,  [x0, #64]
+        stp             q6,  q7,  [x0, #96]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            128b
+        ret
+
+L(put_tbl):
+        .hword L(put_tbl) - 128b
+        .hword L(put_tbl) -  64b
+        .hword L(put_tbl) -  32b
+        .hword L(put_tbl) - 160b
+        .hword L(put_tbl) -   8b
+        .hword L(put_tbl) -   4b
+        .hword L(put_tbl) -   2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
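+// prep_neon widens the 8-bit input to the 16-bit intermediate format
+// consumed by the compound functions above, scaling by 16 (ushll #4).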
+function prep_neon
+        adr             x9,  L(prep_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+4:
+        ld1             {v0.s}[0], [x1], x2
+        ld1             {v1.s}[0], [x1], x2
+        subs            w4,  w4,  #2
+        ushll           v0.8h, v0.8b, #4
+        ushll           v1.8h, v1.8b, #4
+        st1             {v0.4h, v1.4h}, [x0], #16
+        b.gt            4b
+        ret
+8:
+        ld1             {v0.8b}, [x1], x2
+        ld1             {v1.8b}, [x1], x2
+        subs            w4,  w4,  #2
+        ushll           v0.8h, v0.8b, #4
+        ushll           v1.8h, v1.8b, #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            8b
+        ret
+160:
+        add             x9,  x1,  x2
+        lsl             x2,  x2,  #1
+16:
+        ld1             {v0.16b}, [x1], x2
+        ld1             {v1.16b}, [x9], x2
+        subs            w4,  w4,  #2
+        ushll           v4.8h, v0.8b,  #4
+        ushll2          v5.8h, v0.16b, #4
+        ushll           v6.8h, v1.8b,  #4
+        ushll2          v7.8h, v1.16b, #4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        b.gt            16b
+        ret
+320:
+        add             x8,  x0,  w3, uxtw
+32:
+        ld1             {v0.16b, v1.16b},  [x1], x2
+        subs            w4,  w4,  #2
+        ushll           v4.8h,  v0.8b,  #4
+        ushll2          v5.8h,  v0.16b, #4
+        ld1             {v2.16b, v3.16b},  [x1], x2
+        ushll           v6.8h,  v1.8b,  #4
+        ushll2          v7.8h,  v1.16b, #4
+        ushll           v16.8h, v2.8b,  #4
+        st1             {v4.8h,  v5.8h},  [x0], x7
+        ushll2          v17.8h, v2.16b, #4
+        st1             {v6.8h,  v7.8h},  [x8], x7
+        ushll           v18.8h, v3.8b,  #4
+        st1             {v16.8h, v17.8h}, [x0], x7
+        ushll2          v19.8h, v3.16b, #4
+        st1             {v18.8h, v19.8h}, [x8], x7
+        b.gt            32b
+        ret
+640:
+        add             x8,  x0,  #32
+        mov             x6,  #64
+64:
+        ldp             q0,  q1,  [x1]
+        subs            w4,  w4,  #1
+        ushll           v4.8h,  v0.8b,  #4
+        ushll2          v5.8h,  v0.16b, #4
+        ldp             q2,  q3,  [x1, #32]
+        ushll           v6.8h,  v1.8b,  #4
+        ushll2          v7.8h,  v1.16b, #4
+        add             x1,  x1,  x2
+        ushll           v16.8h, v2.8b,  #4
+        st1             {v4.8h,  v5.8h},  [x0], x6
+        ushll2          v17.8h, v2.16b, #4
+        ushll           v18.8h, v3.8b,  #4
+        st1             {v6.8h,  v7.8h},  [x8], x6
+        ushll2          v19.8h, v3.16b, #4
+        st1             {v16.8h, v17.8h}, [x0], x6
+        st1             {v18.8h, v19.8h}, [x8], x6
+        b.gt            64b
+        ret
+1280:
+        add             x8,  x0,  #64
+        mov             x6,  #128
+128:
+        ldp             q0,  q1,  [x1]
+        ldp             q2,  q3,  [x1, #32]
+        ushll           v16.8h,  v0.8b,  #4
+        ushll2          v17.8h,  v0.16b, #4
+        ushll           v18.8h,  v1.8b,  #4
+        ushll2          v19.8h,  v1.16b, #4
+        ushll           v20.8h,  v2.8b,  #4
+        ushll2          v21.8h,  v2.16b, #4
+        ldp             q4,  q5,  [x1, #64]
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
+        ushll           v22.8h,  v3.8b,  #4
+        ushll2          v23.8h,  v3.16b, #4
+        ushll           v24.8h,  v4.8b,  #4
+        ushll2          v25.8h,  v4.16b, #4
+        ushll           v26.8h,  v5.8b,  #4
+        ushll2          v27.8h,  v5.16b, #4
+        ldp             q6,  q7,  [x1, #96]
+        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
+        ushll           v28.8h,  v6.8b,  #4
+        ushll2          v29.8h,  v6.16b, #4
+        ushll           v30.8h,  v7.8b,  #4
+        ushll2          v31.8h,  v7.16b, #4
+        subs            w4,  w4,  #1
+        add             x1,  x1,  x2
+        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
+        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
+        b.gt            128b
+        ret
+
+L(prep_tbl):
+        .hword L(prep_tbl) - 1280b
+        .hword L(prep_tbl) -  640b
+        .hword L(prep_tbl) -  320b
+        .hword L(prep_tbl) -  160b
+        .hword L(prep_tbl) -    8b
+        .hword L(prep_tbl) -    4b
+endfunc
+
+
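+// Helper macros for the 8tap filters below: load rows from two alternating
+// source pointers (\s0, \s1), either as single lanes (load_slice) or as
+// whole registers (load_reg); trailing arguments left blank are skipped
+// via .ifnb.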
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}[0], [\s0], \strd
+        ld1             {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}[0], [\s0], \strd
+        ld1             {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}, [\s0], \strd
+        ld1             {\d1\wd}, [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}, [\s0], \strd
+        ld1             {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+        trn1            \r0\wd, \r0\wd, \r1\wd
+        trn1            \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+        trn1            \r2\wd, \r2\wd, \r3\wd
+        trn1            \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4
+        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
+        trn1            \r0\wd,  \r0\wd, \r2\wd
+        trn1            \r1\wd,  \r1\wd, \r3\wd
+        trn1            \r2\wd,  \r2\wd, \r4\wd
+        trn1            \r3\wd,  \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5
+        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
+        uxtl            \r0\().8h, \r0\().8b
+        uxtl            \r1\().8h, \r1\().8b
+.ifnb \r2
+        uxtl            \r2\().8h, \r2\().8b
+        uxtl            \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4
+        uxtl            \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+        uxtl            \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+        uxtl            \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd
+        mul             \d\wd,  \s0\wd,  v0.h[0]
+        mla             \d\wd,  \s1\wd,  v0.h[1]
+        mla             \d\wd,  \s2\wd,  v0.h[2]
+        mla             \d\wd,  \s3\wd,  v0.h[3]
+.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, so the mul/mla chains are kept
+// tightly sequenced like this.
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+        mul             \d0\().8h, \s0\().8h, v0.h[0]
+        mla             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
+        mul             \d1\().8h, \s1\().8h, v0.h[0]
+        mla             \d1\().8h, \s2\().8h, v0.h[1]
+        mla             \d1\().8h, \s3\().8h, v0.h[2]
+        mla             \d1\().8h, \s4\().8h, v0.h[3]
+        mla             \d1\().8h, \s5\().8h, v0.h[4]
+        mla             \d1\().8h, \s6\().8h, v0.h[5]
+        mla             \d1\().8h, \s7\().8h, v0.h[6]
+        mla             \d1\().8h, \s8\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+        mul             \d0\().8h, \s0\().8h, v0.h[0]
+        mla             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
+        mul             \d1\().8h, \s2\().8h, v0.h[0]
+        mla             \d1\().8h, \s3\().8h, v0.h[1]
+        mla             \d1\().8h, \s4\().8h, v0.h[2]
+        mla             \d1\().8h, \s5\().8h, v0.h[3]
+        mla             \d1\().8h, \s6\().8h, v0.h[4]
+        mla             \d1\().8h, \s7\().8h, v0.h[5]
+        mla             \d1\().8h, \s8\().8h, v0.h[6]
+        mla             \d1\().8h, \s9\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
+        mul             \d0\().8h, \s0\().8h,  v0.h[0]
+        mla             \d0\().8h, \s1\().8h,  v0.h[1]
+        mla             \d0\().8h, \s2\().8h,  v0.h[2]
+        mla             \d0\().8h, \s3\().8h,  v0.h[3]
+        mla             \d0\().8h, \s4\().8h,  v0.h[4]
+        mla             \d0\().8h, \s5\().8h,  v0.h[5]
+        mla             \d0\().8h, \s6\().8h,  v0.h[6]
+        mla             \d0\().8h, \s7\().8h,  v0.h[7]
+        mul             \d1\().8h, \s4\().8h,  v0.h[0]
+        mla             \d1\().8h, \s5\().8h,  v0.h[1]
+        mla             \d1\().8h, \s6\().8h,  v0.h[2]
+        mla             \d1\().8h, \s7\().8h,  v0.h[3]
+        mla             \d1\().8h, \s8\().8h,  v0.h[4]
+        mla             \d1\().8h, \s9\().8h,  v0.h[5]
+        mla             \d1\().8h, \s10\().8h, v0.h[6]
+        mla             \d1\().8h, \s11\().8h, v0.h[7]
+.endm
+.macro sqrshrun_b shift, r0, r1, r2, r3
+        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
+.ifnb \r1
+        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
+.endif
+.ifnb \r2
+        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
+        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
+.endif
+.endm
+.macro srshr_h shift, r0, r1, r2, r3
+        srshr           \r0\().8h, \r0\().8h,  #\shift
+.ifnb \r1
+        srshr           \r1\().8h, \r1\().8h,  #\shift
+.endif
+.ifnb \r2
+        srshr           \r2\().8h, \r2\().8h,  #\shift
+        srshr           \r3\().8h, \r3\().8h,  #\shift
+.endif
+.endm
+.macro st_h strd, reg, lanes
+        st1             {\reg\().h}[0], [x0], \strd
+        st1             {\reg\().h}[1], [x8], \strd
+.if \lanes > 2
+        st1             {\reg\().h}[2], [x0], \strd
+        st1             {\reg\().h}[3], [x8], \strd
+.endif
+.endm
+.macro st_s strd, r0, r1
+        st1             {\r0\().s}[0], [x0], \strd
+        st1             {\r0\().s}[1], [x8], \strd
+.ifnb \r1
+        st1             {\r1\().s}[0], [x0], \strd
+        st1             {\r1\().s}[1], [x8], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().d}[1], [x8], \strd
+.ifnb \r1
+        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().d}[1], [x8], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1
+.ifc \type, put
+        sqrshrun_b      6,     \r0, \r1
+        st_s            \strd, \r0, \r1
+.else
+        srshr_h         2,     \r0, \r1
+        st_d            \strd, \r0, \r1
+.endif
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+        st1             {\r0\wd}, [x0], \strd
+        st1             {\r1\wd}, [x8], \strd
+.ifnb \r2
+        st1             {\r2\wd}, [x0], \strd
+        st1             {\r3\wd}, [x8], \strd
+.endif
+.ifnb \r4
+        st1             {\r4\wd}, [x0], \strd
+        st1             {\r5\wd}, [x8], \strd
+        st1             {\r6\wd}, [x0], \strd
+        st1             {\r7\wd}, [x8], \strd
+.endif
+.endm
+.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
+        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
+        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_b      6,     \r0, \r1, \r2, \r3
+        st_8b           \strd, \r0, \r1, \r2, \r3
+.else
+        srshr_h         2,     \r0, \r1, \r2, \r3
+        st_16b          \strd, \r0, \r1, \r2, \r3
+.endif
+.endm
+.macro shift_store_16 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun        \r0\().8b,  \r0\().8h, #6
+        sqrshrun2       \r0\().16b, \r1\().8h, #6
+        sqrshrun        \r2\().8b,  \r2\().8h, #6
+        sqrshrun2       \r2\().16b, \r3\().8h, #6
+        st_16b          \strd, \r0, \r2
+.else
+        srshr_h         2,     \r0, \r1, \r2, \r3
+        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
+        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
+.endif
+.endm
+
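+// Each put/prep_8tap_<type> entry point only stashes its horizontal and
+// vertical filter-type constants in x8/x9 and branches to the shared
+// 8tap implementation.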
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+        mov             x8,  \type_h
+        mov             x9,  \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
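+// Each constant packs two offsets into X(mc_subpel_filters): bits 7-13
+// select the 8-tap filter set, bits 0-6 the 4-tap set used when the
+// filtered dimension is <= 4.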
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+function \type\()_8tap_neon
+        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx,  \mx, w10
+        mul             \my,  \my, w10
+        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
+        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
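+        // Multiplying by 0x4081 replicates the 4-bit subpel position at
+        // bits 0, 7 and 14; adding the packed filter-type constants then
+        // selects the filter set in each field, and a single tst of the
+        // top field tells whether any filtering is needed at all.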
+.ifc \type, prep
+        uxtw            \d_strd, \w
+        lsl             \d_strd, \d_strd, #1
+.endif
+
+        clz             w8,  \w
+        tst             \mx, #(0x7f << 14)
+        sub             w8,  w8,  #24
+        movrel          x10, X(mc_subpel_filters), -8
+        b.ne            L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        b.ne            L(\type\()_8tap_v)
+        b               \type\()_neon
+
+L(\type\()_8tap_h):
+        cmp             \w,  #4
+        ubfx            w9,  \mx, #7, #7
+        and             \mx, \mx, #0x7f
+        b.le            4f
+        mov             \mx,  w9
+4:
+        tst             \my,  #(0x7f << 14)
+        add             \xmx, x10, \mx, uxtw #3
+        b.ne            L(\type\()_8tap_hv)
+
+        adr             x9,  L(\type\()_8tap_h_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:     // 2xN h
+.ifc \type, put
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,  v0.8b
+2:
+        ld1             {v4.8b},  [\src], \s_strd
+        ld1             {v6.8b},  [\sr2], \s_strd
+        uxtl            v4.8h,  v4.8b
+        uxtl            v6.8h,  v6.8b
+        ext             v5.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v6.16b, v6.16b, #2
+        subs            \h,  \h,  #2
+        trn1            v3.2s,  v4.2s,  v6.2s
+        trn2            v6.2s,  v4.2s,  v6.2s
+        trn1            v4.2s,  v5.2s,  v7.2s
+        trn2            v7.2s,  v5.2s,  v7.2s
+        mul             v3.4h,  v3.4h,  v0.h[0]
+        mla             v3.4h,  v4.4h,  v0.h[1]
+        mla             v3.4h,  v6.4h,  v0.h[2]
+        mla             v3.4h,  v7.4h,  v0.h[3]
+        srshr           v3.4h,  v3.4h,  #2
+        sqrshrun        v3.8b,  v3.8h,  #4
+        st1             {v3.h}[0], [\dst], \d_strd
+        st1             {v3.h}[1], [\ds2], \d_strd
+        b.gt            2b
+        ret
+.endif
+
+40:     // 4xN h
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,  v0.8b
+4:
+        ld1             {v16.8b}, [\src], \s_strd
+        ld1             {v20.8b}, [\sr2], \s_strd
+        uxtl            v16.8h,  v16.8b
+        uxtl            v20.8h,  v20.8b
+        ext             v17.16b, v16.16b, v16.16b, #2
+        ext             v18.16b, v16.16b, v16.16b, #4
+        ext             v19.16b, v16.16b, v16.16b, #6
+        ext             v21.16b, v20.16b, v20.16b, #2
+        ext             v22.16b, v20.16b, v20.16b, #4
+        ext             v23.16b, v20.16b, v20.16b, #6
+        subs            \h,  \h,  #2
+        mul             v16.4h,  v16.4h,  v0.h[0]
+        mla             v16.4h,  v17.4h,  v0.h[1]
+        mla             v16.4h,  v18.4h,  v0.h[2]
+        mla             v16.4h,  v19.4h,  v0.h[3]
+        mul             v20.4h,  v20.4h,  v0.h[0]
+        mla             v20.4h,  v21.4h,  v0.h[1]
+        mla             v20.4h,  v22.4h,  v0.h[2]
+        mla             v20.4h,  v23.4h,  v0.h[3]
+        srshr           v16.4h,  v16.4h,  #2
+        srshr           v20.4h,  v20.4h,  #2
+.ifc \type, put
+        sqrshrun        v16.8b,  v16.8h,  #4
+        sqrshrun        v20.8b,  v20.8h,  #4
+        st1             {v16.s}[0], [\dst], \d_strd
+        st1             {v20.s}[0], [\ds2], \d_strd
+.else
+        st1             {v16.4h}, [\dst], \d_strd
+        st1             {v20.4h}, [\ds2], \d_strd
+.endif
+        b.gt            4b
+        ret
+
+80:     // 8xN h
+        ld1             {v0.8b}, [\xmx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h, v0.8b
+8:
+        ld1             {v16.8b, v17.8b},  [\src], \s_strd
+        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+        uxtl            v20.8h,  v20.8b
+        uxtl            v21.8h,  v21.8b
+
+        mul             v18.8h,  v16.8h,  v0.h[0]
+        mul             v22.8h,  v20.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
+        mla             v18.8h,  v19.8h,  v0.h[\i]
+        mla             v22.8h,  v23.8h,  v0.h[\i]
+.endr
+        subs            \h,  \h,  #2
+        srshr           v18.8h,  v18.8h, #2
+        srshr           v22.8h,  v22.8h, #2
+.ifc \type, put
+        sqrshrun        v18.8b,  v18.8h, #4
+        sqrshrun        v22.8b,  v22.8h, #4
+        st1             {v18.8b}, [\dst], \d_strd
+        st1             {v22.8b}, [\ds2], \d_strd
+.else
+        st1             {v18.8h}, [\dst], \d_strd
+        st1             {v22.8h}, [\ds2], \d_strd
+.endif
+        b.gt            8b
+        ret
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        ld1             {v0.8b}, [\xmx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h, v0.8b
+
+        sub             \s_strd,  \s_strd,  \w, uxtw
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, uxtw
+.endif
+161:
+        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
+        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
+        mov             \mx, \w
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v20.8h,  v20.8b
+        uxtl            v21.8h,  v21.8b
+        uxtl            v22.8h,  v22.8b
+
+16:
+        mul             v24.8h,  v16.8h,  v0.h[0]
+        mul             v25.8h,  v17.8h,  v0.h[0]
+        mul             v26.8h,  v20.8h,  v0.h[0]
+        mul             v27.8h,  v21.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
+        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
+        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
+        mla             v24.8h,  v28.8h,  v0.h[\i]
+        mla             v25.8h,  v29.8h,  v0.h[\i]
+        mla             v26.8h,  v30.8h,  v0.h[\i]
+        mla             v27.8h,  v31.8h,  v0.h[\i]
+.endr
+        srshr           v24.8h,  v24.8h, #2
+        srshr           v25.8h,  v25.8h, #2
+        srshr           v26.8h,  v26.8h, #2
+        srshr           v27.8h,  v27.8h, #2
+        subs            \mx, \mx, #16
+.ifc \type, put
+        sqrshrun        v24.8b,  v24.8h, #4
+        sqrshrun2       v24.16b, v25.8h, #4
+        sqrshrun        v26.8b,  v26.8h, #4
+        sqrshrun2       v26.16b, v27.8h, #4
+        st1             {v24.16b}, [\dst], #16
+        st1             {v26.16b}, [\ds2], #16
+.else
+        st1             {v24.8h, v25.8h}, [\dst], #32
+        st1             {v26.8h, v27.8h}, [\ds2], #32
+.endif
+        b.le            9f
+
+        mov             v16.16b, v18.16b
+        mov             v20.16b, v22.16b
+        ld1             {v17.8b, v18.8b}, [\src], #16
+        ld1             {v21.8b, v22.8b}, [\sr2], #16
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v21.8h,  v21.8b
+        uxtl            v22.8h,  v22.8b
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            161b
+        ret
+
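+// The jump tables hold backward .hword offsets from the table label; the
+// dispatch code indexes them with clz(w) - 24 (w == 128 -> entry 0,
+// w == 2 -> entry 6) and branches to table_address - offset.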
+L(\type\()_8tap_h_tbl):
+        .hword L(\type\()_8tap_h_tbl) - 1280b
+        .hword L(\type\()_8tap_h_tbl) -  640b
+        .hword L(\type\()_8tap_h_tbl) -  320b
+        .hword L(\type\()_8tap_h_tbl) -  160b
+        .hword L(\type\()_8tap_h_tbl) -   80b
+        .hword L(\type\()_8tap_h_tbl) -   40b
+        .hword L(\type\()_8tap_h_tbl) -   20b
+        .hword 0
+
+
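+// The my argument packs two 7-bit filter indices: the low field is used
+// when h <= 4 and the high field otherwise. x10 (set up earlier in the
+// function) points at the table of 8-byte filter rows.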
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            w9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        b.le            4f
+        mov             \my, w9
+4:
+        add             \xmy, x10, \my, uxtw #3
+
+        adr             x9,  L(\type\()_8tap_v_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:     // 2xN v
+.ifc \type, put
+        b.gt            28f
+
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        sxtl            v0.8h, v0.8b
+
+        // 2x2 v
+        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        interleave_1_h  v1, v2, v3, v4, v5
+        b.gt            24f
+        uxtl_b          v1, v2, v3, v4
+        mul_mla_4       v6, v1, v2, v3, v4, .4h
+        sqrshrun_b      6,  v6
+        st_h            \d_strd, v6, 2
+        ret
+
+24:     // 2x4 v
+        load_h          \sr2, \src, \s_strd, v6, v7
+        interleave_1_h  v5, v6, v7
+        interleave_2_s  v1, v2, v3, v4, v5, v6
+        uxtl_b          v1, v2, v3, v4
+        mul_mla_4       v6, v1, v2, v3, v4, .8h
+        sqrshrun_b      6,  v6
+        st_h            \d_strd, v6, 4
+        ret
+
+28:     // 2x8, 2x16 v
+        ld1             {v0.8b}, [\xmy]
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h, v0.8b
+
+        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
+        interleave_1_h  v1,  v2,  v3,  v4,  v5
+        interleave_1_h  v5,  v6,  v7
+        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
+        uxtl_b          v1,  v2,  v3,  v4
+216:
+        subs            \h,  \h,  #8
+        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
+        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
+        interleave_1_h  v7,  v16, v17, v18, v19
+        interleave_1_h  v19, v20, v21, v22, v23
+        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
+        interleave_2_s  v17, v18, v19, v20, v21, v22
+        uxtl_b          v5,  v6,  v7,  v16
+        uxtl_b          v17, v18, v19, v20
+        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
+        sqrshrun_b      6,   v30, v31
+        st_h            \d_strd, v30, 4
+        st_h            \d_strd, v31, 4
+        b.le            0f
+        mov             v1.16b,  v17.16b
+        mov             v2.16b,  v18.16b
+        mov             v3.16b,  v19.16b
+        mov             v4.16b,  v20.16b
+        mov             v5.16b,  v21.16b
+        mov             v6.16b,  v22.16b
+        mov             v7.16b,  v23.16b
+        b               216b
+0:
+        ret
+.endif
+
+40:
+        b.gt            480f
+
+        // 4x2, 4x4 v
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h, v0.8b
+
+        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        interleave_1_s  v1, v2, v3, v4, v5
+        uxtl_b          v1, v2, v3, v4
+        mul_mla_4       v6, v1, v2, v3, v4, .8h
+        shift_store_4   \type, \d_strd, v6
+        b.le            0f
+        load_s          \sr2, \src, \s_strd, v6, v7
+        interleave_1_s  v5, v6, v7
+        uxtl_b          v5, v6
+        mul_mla_4       v7, v3, v4, v5, v6, .8h
+        shift_store_4   \type, \d_strd, v7
+0:
+        ret
+
+480:    // 4x8, 4x16 v
+        ld1             {v0.8b}, [\xmy]
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h, v0.8b
+
+        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+        interleave_1_s  v16, v17, v18
+        interleave_1_s  v18, v19, v20, v21, v22
+        uxtl_b          v16, v17
+        uxtl_b          v18, v19, v20, v21
+
+48:
+        subs            \h,  \h,  #4
+        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
+        interleave_1_s  v22, v23, v24, v25, v26
+        uxtl_b          v22, v23, v24, v25
+        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+        shift_store_4   \type, \d_strd, v1, v2
+        b.le            0f
+        subs            \h,  \h,  #4
+        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
+        interleave_1_s  v26, v27, v16, v17, v18
+        uxtl_b          v26, v27, v16, v17
+        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
+        shift_store_4   \type, \d_strd, v1, v2
+        b.le            0f
+        subs            \h,  \h,  #4
+        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
+        interleave_1_s  v18, v19, v20, v21, v22
+        uxtl_b          v18, v19, v20, v21
+        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+        shift_store_4   \type, \d_strd, v1, v2
+        b.gt            48b
+0:
+        ret
+
+80:
+        b.gt            880f
+
+        // 8x2, 8x4 v
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h, v0.8b
+
+        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        uxtl_b          v1, v2, v3, v4, v5
+        mul_mla_4       v6, v1, v2, v3, v4, .8h
+        mul_mla_4       v7, v2, v3, v4, v5, .8h
+        shift_store_8   \type, \d_strd, v6, v7
+        b.le            0f
+        load_8b         \sr2, \src, \s_strd, v6, v7
+        uxtl_b          v6, v7
+        mul_mla_4       v1, v3, v4, v5, v6, .8h
+        mul_mla_4       v2, v4, v5, v6, v7, .8h
+        shift_store_8   \type, \d_strd, v1, v2
+0:
+        ret
+
+880:    // 8x6, 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        ld1             {v0.8b}, [\xmy]
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        sxtl            v0.8h, v0.8b
+        mov             \my,  \h
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+        uxtl_b          v16, v17, v18, v19, v20, v21, v22
+
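+        // Vertical loop, unrolled so the sliding 7-row window rotates
+        // through v16-v27 without per-iteration register moves.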
+88:
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v23, v24
+        uxtl_b          v23, v24
+        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
+        shift_store_8   \type, \d_strd, v1, v2
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v25, v26
+        uxtl_b          v25, v26
+        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
+        shift_store_8   \type, \d_strd, v3, v4
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v27, v16
+        uxtl_b          v27, v16
+        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
+        shift_store_8   \type, \d_strd, v1, v2
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v17, v18
+        uxtl_b          v17, v18
+        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
+        shift_store_8   \type, \d_strd, v3, v4
+        b.le            9f
+        subs            \h,  \h,  #4
+        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
+        uxtl_b          v19, v20, v21, v22
+        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
+        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
+        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        b.gt            88b
+9:
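+        // Done with this 8-pixel column: rewind src/dst to the top of the
+        // next column (xmy aliases my, which holds the saved row count)
+        // and step 8 pixels (16 output bytes for prep) to the right.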
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        msub            \src, \s_strd, \xmy, \src
+        msub            \dst, \d_strd, \xmy, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        add             \src, \src, #8
+.ifc \type, put
+        add             \dst, \dst, #8
+.else
+        add             \dst, \dst, #16
+.endif
+        b               168b
+0:
+        ret
+
+160:
+        b.gt            1680b
+
+        // 16x2, 16x4 v
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h, v0.8b
+
+        cmp             \h,  #2
+        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
+        uxtl            v16.8h, v1.8b
+        uxtl            v17.8h, v2.8b
+        uxtl            v18.8h, v3.8b
+        uxtl            v19.8h, v4.8b
+        uxtl            v20.8h, v5.8b
+        uxtl2           v23.8h, v1.16b
+        uxtl2           v24.8h, v2.16b
+        uxtl2           v25.8h, v3.16b
+        uxtl2           v26.8h, v4.16b
+        uxtl2           v27.8h, v5.16b
+        mul_mla_4       v1,  v16, v17, v18, v19, .8h
+        mul_mla_4       v16, v17, v18, v19, v20, .8h
+        mul_mla_4       v2,  v23, v24, v25, v26, .8h
+        mul_mla_4       v17, v24, v25, v26, v27, .8h
+        shift_store_16  \type, \d_strd, v1, v2, v16, v17
+        b.le            0f
+        load_16b        \sr2, \src, \s_strd, v6,  v7
+        uxtl            v21.8h, v6.8b
+        uxtl            v22.8h, v7.8b
+        uxtl2           v28.8h, v6.16b
+        uxtl2           v29.8h, v7.16b
+        mul_mla_4       v1,  v18, v19, v20, v21, .8h
+        mul_mla_4       v3,  v19, v20, v21, v22, .8h
+        mul_mla_4       v2,  v25, v26, v27, v28, .8h
+        mul_mla_4       v4,  v26, v27, v28, v29, .8h
+        shift_store_16  \type, \d_strd, v1, v2, v3, v4
+0:
+        ret
+
+L(\type\()_8tap_v_tbl):
+        .hword L(\type\()_8tap_v_tbl) - 1280b
+        .hword L(\type\()_8tap_v_tbl) -  640b
+        .hword L(\type\()_8tap_v_tbl) -  320b
+        .hword L(\type\()_8tap_v_tbl) -  160b
+        .hword L(\type\()_8tap_v_tbl) -   80b
+        .hword L(\type\()_8tap_v_tbl) -   40b
+        .hword L(\type\()_8tap_v_tbl) -   20b
+        .hword 0
+
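+// Combined horizontal + vertical 8-tap: the horizontal pass yields 16-bit
+// intermediates (rounded by #2); the vertical taps accumulate in 32 bits
+// and the result is narrowed with a shift_hv rounding shift.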
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            w9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        b.le            4f
+        mov             \my,  w9
+4:
+        add             \xmy,  x10, \my, uxtw #3
+
+        adr             x9,  L(\type\()_8tap_hv_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:
+.ifc \type, put
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0],  [\xmx]
+        b.gt            280f
+        add             \xmy,  \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+
+        // 2x2, 2x4 hv
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+
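+        // Horizontally filter the first (prologue) row; the two output
+        // pixels are formed with pairwise adds of the 4-tap products.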
+        ld1             {v28.8b}, [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        ext             v29.16b, v28.16b, v28.16b, #2
+        mul             v28.4h,  v28.4h,  v0.4h
+        mul             v29.4h,  v29.4h,  v0.4h
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
+        srshr           v16.4h,  v16.4h,  #2
+        bl              L(\type\()_8tap_filter_2)
+
+        trn1            v16.2s, v16.2s, v28.2s
+        mov             v17.8b, v28.8b
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v19.4h, v1.h[3]
+
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqxtun          v2.8b,  v2.8h
+        subs            \h,  \h,  #2
+        st1             {v2.h}[0], [\dst], \d_strd
+        st1             {v2.h}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b, v18.8b
+        mov             v17.8b, v19.8b
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        ld1             {v1.8b},  [\xmy]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+
+        ld1             {v28.8b}, [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        ext             v29.16b, v28.16b, v28.16b, #2
+        mul             v28.4h,  v28.4h,  v0.4h
+        mul             v29.4h,  v29.4h,  v0.4h
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
+        srshr           v16.4h,  v16.4h,  #2
+
+        bl              L(\type\()_8tap_filter_2)
+        trn1            v16.2s, v16.2s, v28.2s
+        mov             v17.8b, v28.8b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v20.8b, v19.8b, v28.8b, #4
+        mov             v21.8b, v28.8b
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        ext             v22.8b, v21.8b, v28.8b, #4
+        mov             v23.8b, v28.8b
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal           v2.4s,  v20.4h, v1.h[4]
+        smlal           v2.4s,  v21.4h, v1.h[5]
+        smlal           v2.4s,  v22.4h, v1.h[6]
+        smlal           v2.4s,  v23.4h, v1.h[7]
+
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqxtun          v2.8b,  v2.8h
+        subs            \h,  \h,  #2
+        st1             {v2.h}[0], [\dst], \d_strd
+        st1             {v2.h}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b, v18.8b
+        mov             v17.8b, v19.8b
+        mov             v18.8b, v20.8b
+        mov             v19.8b, v21.8b
+        mov             v20.8b, v22.8b
+        mov             v21.8b, v23.8b
+        b               28b
+
+0:
+        br              x15
+
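+// Horizontally filter a 2-pixel slice of two rows with the 4-tap filter;
+// returns row A in lanes 0-1 and row B in lanes 2-3 of v28.4h, with a #2
+// rounding shift to the 16-bit intermediate format.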
+L(\type\()_8tap_filter_2):
+        ld1             {v28.8b},  [\sr2], \s_strd
+        ld1             {v30.8b},  [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        uxtl            v30.8h,  v30.8b
+        ext             v29.16b, v28.16b, v28.16b, #2
+        ext             v31.16b, v30.16b, v30.16b, #2
+        trn1            v27.2s,  v28.2s,  v30.2s
+        trn2            v30.2s,  v28.2s,  v30.2s
+        trn1            v28.2s,  v29.2s,  v31.2s
+        trn2            v31.2s,  v29.2s,  v31.2s
+        mul             v27.4h,  v27.4h,  v0.h[0]
+        mla             v27.4h,  v28.4h,  v0.h[1]
+        mla             v27.4h,  v30.4h,  v0.h[2]
+        mla             v27.4h,  v31.4h,  v0.h[3]
+        srshr           v28.4h,  v27.4h,  #2
+        ret
+.endif
+
+40:
+        add             \xmx, \xmx, #2
+        ld1             {v0.s}[0],  [\xmx]
+        b.gt            480f
+        add             \xmy, \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+
+        // 4x2, 4x4 hv
+        ld1             {v26.8b}, [\src], \s_strd
+        uxtl            v26.8h,  v26.8b
+        ext             v28.16b, v26.16b, v26.16b, #2
+        ext             v29.16b, v26.16b, v26.16b, #4
+        ext             v30.16b, v26.16b, v26.16b, #6
+        mul             v31.4h,  v26.4h,  v0.h[0]
+        mla             v31.4h,  v28.4h,  v0.h[1]
+        mla             v31.4h,  v29.4h,  v0.h[2]
+        mla             v31.4h,  v30.4h,  v0.h[3]
+        srshr           v16.4h,  v31.4h,  #2
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b, v28.8b
+        mov             v18.8b, v29.8b
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        // Interleaving the mul/mla chains hurts performance significantly
+        // on Cortex A53, so each chain is kept tightly sequenced here.
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v28.4h, v1.h[3]
+        smull           v3.4s,  v17.4h, v1.h[0]
+        smlal           v3.4s,  v18.4h, v1.h[1]
+        smlal           v3.4s,  v28.4h, v1.h[2]
+        smlal           v3.4s,  v29.4h, v1.h[3]
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v3.8b,  v3.8h
+        st1             {v2.s}[0], [\dst], \d_strd
+        st1             {v3.s}[0], [\ds2], \d_strd
+.else
+        st1             {v2.4h}, [\dst], \d_strd
+        st1             {v3.4h}, [\ds2], \d_strd
+.endif
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v28.8b
+        mov             v18.8b,  v29.8b
+        b               4b
+
+480:    // 4x8, 4x16, 4x32 hv
+        ld1             {v1.8b},  [\xmy]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+
+        ld1             {v26.8b}, [\src], \s_strd
+        uxtl            v26.8h,  v26.8b
+        ext             v28.16b, v26.16b, v26.16b, #2
+        ext             v29.16b, v26.16b, v26.16b, #4
+        ext             v30.16b, v26.16b, v26.16b, #6
+        mul             v31.4h,  v26.4h,  v0.h[0]
+        mla             v31.4h,  v28.4h,  v0.h[1]
+        mla             v31.4h,  v29.4h,  v0.h[2]
+        mla             v31.4h,  v30.4h,  v0.h[3]
+        srshr           v16.4h,  v31.4h,  #2
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b, v28.8b
+        mov             v18.8b, v29.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v19.8b, v28.8b
+        mov             v20.8b, v29.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v21.8b, v28.8b
+        mov             v22.8b, v29.8b
+
+48:
+        bl              L(\type\()_8tap_filter_4)
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal           v2.4s,  v20.4h, v1.h[4]
+        smlal           v2.4s,  v21.4h, v1.h[5]
+        smlal           v2.4s,  v22.4h, v1.h[6]
+        smlal           v2.4s,  v28.4h, v1.h[7]
+        smull           v3.4s,  v17.4h, v1.h[0]
+        smlal           v3.4s,  v18.4h, v1.h[1]
+        smlal           v3.4s,  v19.4h, v1.h[2]
+        smlal           v3.4s,  v20.4h, v1.h[3]
+        smlal           v3.4s,  v21.4h, v1.h[4]
+        smlal           v3.4s,  v22.4h, v1.h[5]
+        smlal           v3.4s,  v28.4h, v1.h[6]
+        smlal           v3.4s,  v29.4h, v1.h[7]
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v3.8b,  v3.8h
+        st1             {v2.s}[0], [\dst], \d_strd
+        st1             {v3.s}[0], [\ds2], \d_strd
+.else
+        st1             {v2.4h}, [\dst], \d_strd
+        st1             {v3.4h}, [\ds2], \d_strd
+.endif
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v28.8b
+        mov             v22.8b,  v29.8b
+        b               48b
+0:
+        br              x15
+
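+// Horizontally filter a 4-pixel slice of two rows with the 4-tap filter;
+// returns the rows in v28.4h and v29.4h with a #2 rounding shift.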
+L(\type\()_8tap_filter_4):
+        ld1             {v26.8b}, [\sr2], \s_strd
+        ld1             {v27.8b}, [\src], \s_strd
+        uxtl            v26.8h,  v26.8b
+        uxtl            v27.8h,  v27.8b
+        ext             v28.16b, v26.16b, v26.16b, #2
+        ext             v29.16b, v26.16b, v26.16b, #4
+        ext             v30.16b, v26.16b, v26.16b, #6
+        mul             v31.4h,  v26.4h,  v0.h[0]
+        mla             v31.4h,  v28.4h,  v0.h[1]
+        mla             v31.4h,  v29.4h,  v0.h[2]
+        mla             v31.4h,  v30.4h,  v0.h[3]
+        ext             v28.16b, v27.16b, v27.16b, #2
+        ext             v29.16b, v27.16b, v27.16b, #4
+        ext             v30.16b, v27.16b, v27.16b, #6
+        mul             v27.4h,  v27.4h,  v0.h[0]
+        mla             v27.4h,  v28.4h,  v0.h[1]
+        mla             v27.4h,  v29.4h,  v0.h[2]
+        mla             v27.4h,  v30.4h,  v0.h[3]
+        srshr           v28.4h,  v31.4h,  #2
+        srshr           v29.4h,  v27.4h,  #2
+        ret
+
+80:
+160:
+320:
+        b.gt            880f
+        add             \xmy,  \xmy,  #2
+        ld1             {v0.8b},  [\xmx]
+        ld1             {v1.s}[0],  [\xmy]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+        mov             \my,  \h
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v28.8b, v29.8b},  [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        uxtl            v29.8h,  v29.8b
+        mul             v24.8h,  v28.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
+        mla             v24.8h,  v26.8h,  v0.h[\i]
+.endr
+        srshr           v16.8h,  v24.8h, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v24.16b
+        mov             v18.16b, v25.16b
+
+8:
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smull2          v3.4s,  v16.8h, v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,  v17.4h, v1.h[0]
+        smull2          v5.4s,  v17.8h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal2          v3.4s,  v17.8h, v1.h[1]
+        smlal           v4.4s,  v18.4h, v1.h[1]
+        smlal2          v5.4s,  v18.8h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal2          v3.4s,  v18.8h, v1.h[2]
+        smlal           v4.4s,  v24.4h, v1.h[2]
+        smlal2          v5.4s,  v24.8h, v1.h[2]
+        smlal           v2.4s,  v24.4h, v1.h[3]
+        smlal2          v3.4s,  v24.8h, v1.h[3]
+        smlal           v4.4s,  v25.4h, v1.h[3]
+        smlal2          v5.4s,  v25.8h, v1.h[3]
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
+        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
+        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v4.8b,  v4.8h
+        st1             {v2.8b}, [\dst], \d_strd
+        st1             {v4.8b}, [\ds2], \d_strd
+.else
+        st1             {v2.8h}, [\dst], \d_strd
+        st1             {v4.8h}, [\ds2], \d_strd
+.endif
+        b.le            9f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v24.16b
+        mov             v18.16b, v25.16b
+        b               8b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #2
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               164b
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+        ld1             {v0.8b},  [\xmx]
+        ld1             {v1.8b},  [\xmy]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        sub             \src,  \src,  \s_strd, lsl #1
+        sxtl            v0.8h,  v0.8b
+        sxtl            v1.8h,  v1.8b
+        mov             x15, x30
+        mov             \my,  \h
+
+168:
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v28.8b, v29.8b},  [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        uxtl            v29.8h,  v29.8b
+        mul             v24.8h,  v28.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
+        mla             v24.8h,  v26.8h,  v0.h[\i]
+.endr
+        srshr           v16.8h,  v24.8h, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v24.16b
+        mov             v18.16b, v25.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v19.16b, v24.16b
+        mov             v20.16b, v25.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v21.16b, v24.16b
+        mov             v22.16b, v25.16b
+
+88:
+        smull           v2.4s,  v16.4h, v1.h[0]
+        smull2          v3.4s,  v16.8h, v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,  v17.4h, v1.h[0]
+        smull2          v5.4s,  v17.8h, v1.h[0]
+        smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal2          v3.4s,  v17.8h, v1.h[1]
+        smlal           v4.4s,  v18.4h, v1.h[1]
+        smlal2          v5.4s,  v18.8h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal2          v3.4s,  v18.8h, v1.h[2]
+        smlal           v4.4s,  v19.4h, v1.h[2]
+        smlal2          v5.4s,  v19.8h, v1.h[2]
+        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal2          v3.4s,  v19.8h, v1.h[3]
+        smlal           v4.4s,  v20.4h, v1.h[3]
+        smlal2          v5.4s,  v20.8h, v1.h[3]
+        smlal           v2.4s,  v20.4h, v1.h[4]
+        smlal2          v3.4s,  v20.8h, v1.h[4]
+        smlal           v4.4s,  v21.4h, v1.h[4]
+        smlal2          v5.4s,  v21.8h, v1.h[4]
+        smlal           v2.4s,  v21.4h, v1.h[5]
+        smlal2          v3.4s,  v21.8h, v1.h[5]
+        smlal           v4.4s,  v22.4h, v1.h[5]
+        smlal2          v5.4s,  v22.8h, v1.h[5]
+        smlal           v2.4s,  v22.4h, v1.h[6]
+        smlal2          v3.4s,  v22.8h, v1.h[6]
+        smlal           v4.4s,  v24.4h, v1.h[6]
+        smlal2          v5.4s,  v24.8h, v1.h[6]
+        smlal           v2.4s,  v24.4h, v1.h[7]
+        smlal2          v3.4s,  v24.8h, v1.h[7]
+        smlal           v4.4s,  v25.4h, v1.h[7]
+        smlal2          v5.4s,  v25.8h, v1.h[7]
+        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
+        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
+        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
+        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v4.8b,  v4.8h
+        st1             {v2.8b}, [\dst], \d_strd
+        st1             {v4.8b}, [\ds2], \d_strd
+.else
+        st1             {v2.8h}, [\dst], \d_strd
+        st1             {v4.8h}, [\ds2], \d_strd
+.endif
+        b.le            9f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v24.16b
+        mov             v22.16b, v25.16b
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #3
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               168b
+0:
+        br              x15
+
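+// Horizontally filter an 8-pixel slice of two rows with the full 8-tap
+// filter; returns the rows in v24.8h and v25.8h with a #2 rounding shift.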
+L(\type\()_8tap_filter_8):
+        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
+        ld1             {v30.8b, v31.8b},  [\src], \s_strd
+        uxtl            v28.8h,  v28.8b
+        uxtl            v29.8h,  v29.8b
+        uxtl            v30.8h,  v30.8b
+        uxtl            v31.8h,  v31.8b
+        mul             v24.8h,  v28.8h,  v0.h[0]
+        mul             v25.8h,  v30.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
+        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
+        mla             v24.8h,  v26.8h,  v0.h[\i]
+        mla             v25.8h,  v27.8h,  v0.h[\i]
+.endr
+        srshr           v24.8h,  v24.8h, #2
+        srshr           v25.8h,  v25.8h, #2
+        ret
+
+L(\type\()_8tap_hv_tbl):
+        .hword L(\type\()_8tap_hv_tbl) - 1280b
+        .hword L(\type\()_8tap_hv_tbl) -  640b
+        .hword L(\type\()_8tap_hv_tbl) -  320b
+        .hword L(\type\()_8tap_hv_tbl) -  160b
+        .hword L(\type\()_8tap_hv_tbl) -   80b
+        .hword L(\type\()_8tap_hv_tbl) -   40b
+        .hword L(\type\()_8tap_hv_tbl) -   20b
+        .hword 0
+endfunc
+
+
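+// Bilinear put/prep: horizontally, out = ((16 - mx)*src[x] + mx*src[x+1]
+// + 8) >> 4, and likewise vertically with my; the weight pairs live in
+// v0/v1 (horizontal) and v2/v3 (vertical). prep stores 16-bit
+// intermediates without the final narrowing.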
+function \type\()_bilin_8bpc_neon, export=1
+        dup             v1.16b, \mx
+        dup             v3.16b, \my
+        mov             w9,  #16
+        sub             w8, w9, \mx
+        sub             w9, w9, \my
+        dup             v0.16b, w8
+        dup             v2.16b, w9
+.ifc \type, prep
+        uxtw            \d_strd, \w
+        lsl             \d_strd, \d_strd, #1
+.endif
+
+        clz             w8,  \w
+        sub             w8,  w8,  #24
+        cbnz            \mx, L(\type\()_bilin_h)
+        cbnz            \my, L(\type\()_bilin_v)
+        b               \type\()_neon
+
+L(\type\()_bilin_h):
+        cbnz            \my, L(\type\()_bilin_hv)
+
+        adr             x9,  L(\type\()_bilin_h_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:     // 2xN h
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        ld1             {v4.s}[0],  [\src], \s_strd
+        ld1             {v6.s}[0],  [\sr2], \s_strd
+        ext             v5.8b,  v4.8b,  v4.8b, #1
+        ext             v7.8b,  v6.8b,  v6.8b, #1
+        trn1            v4.4h,  v4.4h,  v6.4h
+        trn1            v5.4h,  v5.4h,  v7.4h
+        subs            \h,  \h,  #2
+        umull           v4.8h,  v4.8b,  v0.8b
+        umlal           v4.8h,  v5.8b,  v1.8b
+        uqrshrn         v4.8b,  v4.8h,  #4
+        st1             {v4.h}[0], [\dst], \d_strd
+        st1             {v4.h}[1], [\ds2], \d_strd
+        b.gt            2b
+        ret
+.endif
+
+40:     // 4xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        ld1             {v4.8b}, [\src], \s_strd
+        ld1             {v6.8b}, [\sr2], \s_strd
+        ext             v5.8b,  v4.8b,  v4.8b, #1
+        ext             v7.8b,  v6.8b,  v6.8b, #1
+        trn1            v4.2s,  v4.2s,  v6.2s
+        trn1            v5.2s,  v5.2s,  v7.2s
+        subs            \h,  \h,  #2
+        umull           v4.8h,  v4.8b,  v0.8b
+        umlal           v4.8h,  v5.8b,  v1.8b
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #4
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+.else
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+.endif
+        b.gt            4b
+        ret
+
+80:     // 8xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        ld1             {v4.16b}, [\src], \s_strd
+        ld1             {v6.16b}, [\sr2], \s_strd
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v7.16b, v6.16b, v6.16b, #1
+        subs            \h,  \h,  #2
+        umull           v4.8h,  v4.8b,  v0.8b
+        umull           v6.8h,  v6.8b,  v0.8b
+        umlal           v4.8h,  v5.8b,  v1.8b
+        umlal           v6.8h,  v7.8b,  v1.8b
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #4
+        uqrshrn         v6.8b,  v6.8h,  #4
+        st1             {v4.8b}, [\dst], \d_strd
+        st1             {v6.8b}, [\ds2], \d_strd
+.else
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v6.8h}, [\ds2], \d_strd
+.endif
+        b.gt            8b
+        ret
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+
+        sub             \s_strd,  \s_strd,  \w, uxtw
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, uxtw
+.endif
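+        // The previous 8 source bytes are kept in the high halves of
+        // v16/v20 so that ext #8 and #9 produce the aligned and +1-pixel
+        // 16-byte windows.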
+161:
+        ld1             {v16.d}[1],  [\src], #8
+        ld1             {v20.d}[1],  [\sr2], #8
+        mov             \mx, \w
+
+16:
+        ld1             {v18.16b},  [\src], #16
+        ld1             {v22.16b},  [\sr2], #16
+        ext             v17.16b, v16.16b, v18.16b, #8
+        ext             v19.16b, v16.16b, v18.16b, #9
+        ext             v21.16b, v20.16b, v22.16b, #8
+        ext             v23.16b, v20.16b, v22.16b, #9
+        umull           v16.8h,  v17.8b,  v0.8b
+        umull2          v17.8h,  v17.16b, v0.16b
+        umull           v20.8h,  v21.8b,  v0.8b
+        umull2          v21.8h,  v21.16b, v0.16b
+        umlal           v16.8h,  v19.8b,  v1.8b
+        umlal2          v17.8h,  v19.16b, v1.16b
+        umlal           v20.8h,  v23.8b,  v1.8b
+        umlal2          v21.8h,  v23.16b, v1.16b
+        subs            \mx, \mx, #16
+.ifc \type, put
+        uqrshrn         v16.8b,  v16.8h, #4
+        uqrshrn2        v16.16b, v17.8h, #4
+        uqrshrn         v20.8b,  v20.8h, #4
+        uqrshrn2        v20.16b, v21.8h, #4
+        st1             {v16.16b}, [\dst], #16
+        st1             {v20.16b}, [\ds2], #16
+.else
+        st1             {v16.8h, v17.8h}, [\dst], #32
+        st1             {v20.8h, v21.8h}, [\ds2], #32
+.endif
+        b.le            9f
+
+        mov             v16.16b, v18.16b
+        mov             v20.16b, v22.16b
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            161b
+        ret
+
+L(\type\()_bilin_h_tbl):
+        .hword L(\type\()_bilin_h_tbl) - 1280b
+        .hword L(\type\()_bilin_h_tbl) -  640b
+        .hword L(\type\()_bilin_h_tbl) -  320b
+        .hword L(\type\()_bilin_h_tbl) -  160b
+        .hword L(\type\()_bilin_h_tbl) -   80b
+        .hword L(\type\()_bilin_h_tbl) -   40b
+        .hword L(\type\()_bilin_h_tbl) -   20b
+        .hword 0
+
+
+L(\type\()_bilin_v):
+        cmp             \h,  #4
+        adr             x9,  L(\type\()_bilin_v_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:     // 2xN v
+.ifc \type, put
+        cmp             \h,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        ld1             {v16.h}[0], [\src], \s_strd
+        b.gt            24f
+        ld1             {v17.h}[0], [\sr2], \s_strd
+        ld1             {v18.h}[0], [\src], \s_strd
+        trn1            v16.4h, v16.4h, v17.4h
+        trn1            v17.4h, v17.4h, v18.4h
+        umull           v4.8h,  v16.8b,  v2.8b
+        umlal           v4.8h,  v17.8b,  v3.8b
+        uqrshrn         v4.8b,  v4.8h,  #4
+        st1             {v4.h}[0], [\dst]
+        st1             {v4.h}[1], [\ds2]
+        ret
+24:     // 2x4, 2x8, ... v
+        ld1             {v17.h}[0], [\sr2], \s_strd
+        ld1             {v18.h}[0], [\src], \s_strd
+        ld1             {v19.h}[0], [\sr2], \s_strd
+        ld1             {v20.h}[0], [\src], \s_strd
+        trn1            v16.4h, v16.4h, v17.4h
+        trn1            v17.4h, v17.4h, v18.4h
+        trn1            v18.4h, v18.4h, v19.4h
+        trn1            v19.4h, v19.4h, v20.4h
+        trn1            v16.2s, v16.2s, v18.2s
+        trn1            v17.2s, v17.2s, v19.2s
+        umull           v4.8h,  v16.8b,  v2.8b
+        umlal           v4.8h,  v17.8b,  v3.8b
+        subs            \h,  \h,  #4
+        uqrshrn         v4.8b,  v4.8h,  #4
+        st1             {v4.h}[0], [\dst], \d_strd
+        st1             {v4.h}[1], [\ds2], \d_strd
+        st1             {v4.h}[2], [\dst], \d_strd
+        st1             {v4.h}[3], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b, v20.8b
+        b               24b
+0:
+        ret
+.endif
+
+40:     // 4xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.s}[0], [\src], \s_strd
+4:
+        ld1             {v17.s}[0], [\sr2], \s_strd
+        ld1             {v18.s}[0], [\src], \s_strd
+        trn1            v16.2s, v16.2s, v17.2s
+        trn1            v17.2s, v17.2s, v18.2s
+        umull           v4.8h,  v16.8b,  v2.8b
+        umlal           v4.8h,  v17.8b,  v3.8b
+        subs            \h,  \h,  #2
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #4
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+.else
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+.endif
+        b.le            0f
+        mov             v16.8b, v18.8b
+        b               4b
+0:
+        ret
+
+80:     // 8xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.8b}, [\src], \s_strd
+8:
+        ld1             {v17.8b}, [\sr2], \s_strd
+        ld1             {v18.8b}, [\src], \s_strd
+        umull           v4.8h,  v16.8b,  v2.8b
+        umull           v5.8h,  v17.8b,  v2.8b
+        umlal           v4.8h,  v17.8b,  v3.8b
+        umlal           v5.8h,  v18.8b,  v3.8b
+        subs            \h,  \h,  #2
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #4
+        uqrshrn         v5.8b,  v5.8h,  #4
+        st1             {v4.8b}, [\dst], \d_strd
+        st1             {v5.8b}, [\ds2], \d_strd
+.else
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v5.8h}, [\ds2], \d_strd
+.endif
+        b.le            0f
+        mov             v16.8b, v18.8b
+        b               8b
+0:
+        ret
+
+160:    // 16xN, 32xN, ...
+320:
+640:
+1280:
+        mov             \my,  \h
+1:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v16.16b}, [\src], \s_strd
+2:
+        ld1             {v17.16b}, [\sr2], \s_strd
+        ld1             {v18.16b}, [\src], \s_strd
+        umull           v4.8h,  v16.8b,  v2.8b
+        umull2          v5.8h,  v16.16b, v2.16b
+        umull           v6.8h,  v17.8b,  v2.8b
+        umull2          v7.8h,  v17.16b, v2.16b
+        umlal           v4.8h,  v17.8b,  v3.8b
+        umlal2          v5.8h,  v17.16b, v3.16b
+        umlal           v6.8h,  v18.8b,  v3.8b
+        umlal2          v7.8h,  v18.16b, v3.16b
+        subs            \h,  \h,  #2
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #4
+        uqrshrn2        v4.16b, v5.8h,  #4
+        uqrshrn         v6.8b,  v6.8h,  #4
+        uqrshrn2        v6.16b, v7.8h,  #4
+        st1             {v4.16b}, [\dst], \d_strd
+        st1             {v6.16b}, [\ds2], \d_strd
+.else
+        st1             {v4.8h, v5.8h}, [\dst], \d_strd
+        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
+.endif
+        b.le            9f
+        mov             v16.16b, v18.16b
+        b               2b
+9:
+        subs            \w,  \w,  #16
+        b.le            0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        msub            \src, \s_strd, \xmy, \src
+        msub            \dst, \d_strd, \xmy, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #16
+.ifc \type, put
+        add             \dst, \dst, #16
+.else
+        add             \dst, \dst, #32
+.endif
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_v_tbl):
+        .hword L(\type\()_bilin_v_tbl) - 1280b
+        .hword L(\type\()_bilin_v_tbl) -  640b
+        .hword L(\type\()_bilin_v_tbl) -  320b
+        .hword L(\type\()_bilin_v_tbl) -  160b
+        .hword L(\type\()_bilin_v_tbl) -   80b
+        .hword L(\type\()_bilin_v_tbl) -   40b
+        .hword L(\type\()_bilin_v_tbl) -   20b
+        .hword 0
+
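+// Bilinear h + v: the vertical pass runs on the 16-bit horizontal
+// intermediates, so the weights are widened to .8h and the final shift is
+// #8 for put (#4 for prep).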
+L(\type\()_bilin_hv):
+        uxtl            v2.8h, v2.8b
+        uxtl            v3.8h, v3.8b
+        adr             x9,  L(\type\()_bilin_hv_tbl)
+        ldrh            w8,  [x9, x8, lsl #1]
+        sub             x9,  x9,  w8, uxtw
+        br              x9
+
+20:     // 2xN hv
+.ifc \type, put
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v28.s}[0],  [\src], \s_strd
+        ext             v29.8b, v28.8b, v28.8b, #1
+        umull           v16.8h, v28.8b, v0.8b
+        umlal           v16.8h, v29.8b, v1.8b
+
+2:
+        ld1             {v28.s}[0],  [\sr2], \s_strd
+        ld1             {v30.s}[0],  [\src], \s_strd
+        ext             v29.8b, v28.8b, v28.8b, #1
+        ext             v31.8b, v30.8b, v30.8b, #1
+        trn1            v28.4h, v28.4h, v30.4h
+        trn1            v29.4h, v29.4h, v31.4h
+        umull           v17.8h, v28.8b, v0.8b
+        umlal           v17.8h, v29.8b, v1.8b
+
+        trn1            v16.2s, v16.2s, v17.2s
+
+        mul             v4.4h,  v16.4h, v2.4h
+        mla             v4.4h,  v17.4h, v3.4h
+        uqrshrn         v4.8b,  v4.8h,  #8
+        subs            \h,  \h,  #2
+        st1             {v4.h}[0], [\dst], \d_strd
+        st1             {v4.h}[1], [\ds2], \d_strd
+        b.le            0f
+        trn2            v16.2s, v17.2s, v17.2s
+        b               2b
+0:
+        ret
+.endif
+
+40:     // 4xN hv
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v28.8b},  [\src], \s_strd
+        ext             v29.8b, v28.8b, v28.8b, #1
+        umull           v16.8h, v28.8b, v0.8b
+        umlal           v16.8h, v29.8b, v1.8b
+
+4:
+        ld1             {v28.8b},  [\sr2], \s_strd
+        ld1             {v30.8b},  [\src], \s_strd
+        ext             v29.8b, v28.8b, v28.8b, #1
+        ext             v31.8b, v30.8b, v30.8b, #1
+        trn1            v28.2s, v28.2s, v30.2s
+        trn1            v29.2s, v29.2s, v31.2s
+        umull           v17.8h, v28.8b, v0.8b
+        umlal           v17.8h, v29.8b, v1.8b
+
+        trn1            v16.2d, v16.2d, v17.2d
+
+        mul             v4.8h,  v16.8h, v2.8h
+        mla             v4.8h,  v17.8h, v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #8
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+.else
+        urshr           v4.8h,  v4.8h,  #4
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+.endif
+        b.le            0f
+        trn2            v16.2d, v17.2d, v17.2d
+        b               4b
+0:
+        ret
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        mov             \my,  \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v28.16b},  [\src], \s_strd
+        ext             v29.16b, v28.16b, v28.16b, #1
+        umull           v16.8h, v28.8b, v0.8b
+        umlal           v16.8h, v29.8b, v1.8b
+
+2:
+        ld1             {v28.16b},  [\sr2], \s_strd
+        ld1             {v30.16b},  [\src], \s_strd
+        ext             v29.16b, v28.16b, v28.16b, #1
+        ext             v31.16b, v30.16b, v30.16b, #1
+        umull           v17.8h, v28.8b, v0.8b
+        umlal           v17.8h, v29.8b, v1.8b
+        umull           v18.8h, v30.8b, v0.8b
+        umlal           v18.8h, v31.8b, v1.8b
+
+        mul             v4.8h,  v16.8h, v2.8h
+        mla             v4.8h,  v17.8h, v3.8h
+        mul             v5.8h,  v17.8h, v2.8h
+        mla             v5.8h,  v18.8h, v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        uqrshrn         v4.8b,  v4.8h,  #8
+        uqrshrn         v5.8b,  v5.8h,  #8
+        st1             {v4.8b}, [\dst], \d_strd
+        st1             {v5.8b}, [\ds2], \d_strd
+.else
+        urshr           v4.8h,  v4.8h,  #4
+        urshr           v5.8h,  v5.8h,  #4
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v5.8h}, [\ds2], \d_strd
+.endif
+        b.le            9f
+        mov             v16.16b, v18.16b
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_hv_tbl):
+        .hword L(\type\()_bilin_hv_tbl) - 1280b
+        .hword L(\type\()_bilin_hv_tbl) -  640b
+        .hword L(\type\()_bilin_hv_tbl) -  320b
+        .hword L(\type\()_bilin_hv_tbl) -  160b
+        .hword L(\type\()_bilin_hv_tbl) -   80b
+        .hword L(\type\()_bilin_hv_tbl) -   40b
+        .hword L(\type\()_bilin_hv_tbl) -   20b
+        .hword 0
+endfunc
+.endm
+
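+// Instantiate put and prep. The registers map onto the macro's dst,
+// d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2 and sr2 arguments,
+// with the final number being shift_hv; prep takes no destination stride
+// parameter, so x7 serves as d_strd (derived from the block width inside
+// the function).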
+filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
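+// Fetch the 8-byte warp filter selected by the coordinate's integer part
+// (src >> 10) from the table at x11, then step the coordinate by inc.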
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10
+        ldr             \dst, [x11, w13, sxtw #3]
+        add             \src, \src, \inc
+.endm
+
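+// Horizontal warp pass over one source row: each of the 8 output pixels
+// uses its own 8-tap filter, picked from its x coordinate (w12, stepped
+// by w7 per pixel). The dot products are reduced with saddlp/addp trees
+// and narrowed into v16.8h with a #3 rounding shift; w5 advances by w8
+// to the next row.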
+function warp_filter_horz_neon
+        add             w12, w5,  #512
+
+        ld1             {v16.8b, v17.8b}, [x2], x3
+
+        load_filter_row d0, w12, w7
+        uxtl            v16.8h,  v16.8b
+        load_filter_row d1, w12, w7
+        uxtl            v17.8h,  v17.8b
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h,   v0.8b
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h,   v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h,   v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h,   v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h,   v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h,   v5.8b
+        ext             v18.16b, v16.16b, v17.16b, #2*1
+        mul             v23.8h,  v16.8h,  v0.8h
+        sxtl            v6.8h,   v6.8b
+        ext             v19.16b, v16.16b, v17.16b, #2*2
+        mul             v18.8h,  v18.8h,  v1.8h
+        sxtl            v7.8h,   v7.8b
+        ext             v20.16b, v16.16b, v17.16b, #2*3
+        mul             v19.8h,  v19.8h,  v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4
+        saddlp          v23.4s,  v23.8h
+        mul             v20.8h,  v20.8h,  v3.8h
+        ext             v22.16b, v16.16b, v17.16b, #2*5
+        saddlp          v18.4s,  v18.8h
+        mul             v21.8h,  v21.8h,  v4.8h
+        saddlp          v19.4s,  v19.8h
+        mul             v22.8h,  v22.8h,  v5.8h
+        saddlp          v20.4s,  v20.8h
+        saddlp          v21.4s,  v21.8h
+        saddlp          v22.4s,  v22.8h
+        addp            v18.4s,  v23.4s,  v18.4s
+        ext             v23.16b, v16.16b, v17.16b, #2*6
+        addp            v19.4s,  v19.4s,  v20.4s
+        mul             v23.8h,  v23.8h,  v6.8h
+        ext             v20.16b, v16.16b, v17.16b, #2*7
+        mul             v20.8h,  v20.8h,  v7.8h
+        saddlp          v23.4s,  v23.8h
+        addp            v21.4s,  v21.4s,  v22.4s
+        saddlp          v20.4s,  v20.8h
+        addp            v20.4s,  v23.4s,  v20.4s
+        addp            v18.4s,  v18.4s,  v19.4s
+        addp            v20.4s,  v21.4s,  v20.4s
+
+        add             w5,  w5,  w8
+
+        rshrn           v16.4h,  v18.4s,  #3
+        rshrn2          v16.8h,  v20.4s,  #3
+
+        ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        ldr             x4,  [x4]
+        sbfx            x7,  x4, #0,  #16
+        sbfx            x8,  x4, #16, #16
+        sbfx            x9,  x4, #32, #16
+        sbfx            x4,  x4, #48, #16
+        mov             w10, #8
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        sub             x2,  x2,  #3
+        movrel          x11, X(mc_warp_filter), 64*8
+        mov             x15, x30
+.ifnb \t
+        lsl             x1,  x1,  #1
+.endif
+
+        bl              warp_filter_horz_neon
+        mov             v24.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v25.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v26.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v27.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v28.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v29.16b, v16.16b
+        bl              warp_filter_horz_neon
+        mov             v30.16b, v16.16b
+
+1:
+        add             w14, w6,  #512
+        bl              warp_filter_horz_neon
+        mov             v31.16b, v16.16b
+
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
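+        // Transpose the eight per-pixel filters so each register holds one
+        // tap position across all 8 pixels, then widen the taps to 16 bit.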
+        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        sxtl            v2.8h,   v2.8b
+        sxtl            v3.8h,   v3.8b
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s,  v24.4h,  v0.4h
+        smlal           v16.4s,  v25.4h,  v1.4h
+        smlal           v16.4s,  v26.4h,  v2.4h
+        smlal           v16.4s,  v27.4h,  v3.4h
+        smlal           v16.4s,  v28.4h,  v4.4h
+        smlal           v16.4s,  v29.4h,  v5.4h
+        smlal           v16.4s,  v30.4h,  v6.4h
+        smlal           v16.4s,  v31.4h,  v7.4h
+        smull2          v17.4s,  v24.8h,  v0.8h
+        smlal2          v17.4s,  v25.8h,  v1.8h
+        smlal2          v17.4s,  v26.8h,  v2.8h
+        smlal2          v17.4s,  v27.8h,  v3.8h
+        smlal2          v17.4s,  v28.8h,  v4.8h
+        smlal2          v17.4s,  v29.8h,  v5.8h
+        smlal2          v17.4s,  v30.8h,  v6.8h
+        smlal2          v17.4s,  v31.8h,  v7.8h
+
+        mov             v24.16b, v25.16b
+        mov             v25.16b, v26.16b
+        sqrshrn         v16.4h,  v16.4s,  #\shift
+        mov             v26.16b, v27.16b
+        sqrshrn2        v16.8h,  v17.4s,  #\shift
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        sqxtun          v16.8b,  v16.8h
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1
+.ifnb \t
+        st1             {v16.8h}, [x0], x1
+.else
+        st1             {v16.8b}, [x0], x1
+.endif
+
+        add             w6,  w6,  w4
+        b.gt            1b
+
+        br              x15
+endfunc
+.endm
+
+warp  , 11
+warp t, 7
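+
+// A rough scalar model of the two instantiations above (hedged sketch;
+// variable names are illustrative): 15 intermediate rows are filtered
+// horizontally, then 8 output rows vertically, each output pixel using an
+// 8-tap filter selected per pixel while the offsets advance by the abcd[]
+// deltas:
+//
+//   const int8_t *f = mc_warp_filter[64 + ((ofs + 512) >> 10)];
+//   int sum = 0;
+//   for (int k = 0; k < 8; k++)
+//       sum += f[k] * in[k];
+//   mid = (sum + 4) >> 3;               // horizontal pass: rshrn #3
+//   out = sat((sum + rnd) >> shift);    // vertical: sqrshrn #11 (put) or #7 (prep)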
+
+// void dav1d_emu_edge_8bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+        ldp             x8,  x9,  [sp]
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13          // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+        ld1r            {v0.16b}, [x8]
+        mov             x12, x6                // out = dst
+        mov             x3,  x4
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16
+        b.gt            1b
+.endif
+        mov             x13, x8
+        add             x12, x6,  x4           // out = dst + left_ext
+        mov             x3,  x2
+1:
+        ld1             {v0.16b, v1.16b}, [x13], #32
+        subs            x3,  x3,  #32
+        st1             {v0.16b, v1.16b}, [x12], #32
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2           // in + center_w
+        sub             x3,  x3,  #1           // in + center_w - 1
+        add             x12, x6,  x4           // dst + left_ext
+        ld1r            {v0.16b}, [x3]
+        add             x12, x12, x2           // out = dst + left_ext + center_w
+        mov             x3,  x11
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7
+        add             x8,  x8,  x9
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f
+        // need_left
+        cbz             x11, 3f
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride
+        mov             x4,  x0
+1:
+        ld1             {v0.16b, v1.16b}, [x8], #32
+        mov             x3,  x10
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.16b, v1.16b}, [x14], #32
+        mov             x3,  x5
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
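+
+// A rough scalar model of the function above (hedged; iclip() as in the
+// comments, loop structure illustrative):
+//
+//   ref += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1);
+//   for (int row = 0; row < center_h; row++) {
+//       // fill left_ext pixels with ref[0], copy center_w pixels from
+//       // ref, fill right_ext pixels with ref[center_w - 1]
+//   }
+//   // then replicate the first written row top_ext times upwards and
+//   // the last written row bottom_ext times downwards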
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
new file mode 100644 (file)
index 0000000..7ac1863
--- /dev/null
@@ -0,0 +1,3569 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d1, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
+        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
+        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
+        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
+        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
+.endm
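+
+// Per output pixel, the macro above computes (hedged scalar sketch, with
+// v28/v29 set up in bidir_fn below):
+//
+//   t   = tmp1 + tmp2;                              // sqadd
+//   t   = max(t, -2*PREP_BIAS - (1 << intermediate_bits));
+//   t  += 2*PREP_BIAS + (1 << intermediate_bits);   // sqsub of v28
+//   dst = t >> (intermediate_bits + 1);             // sshl by v29
+//
+// i.e. a rounded, bias-corrected average, clamped to be non-negative
+// before the final shift.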
+
+.macro w_avg d0, d1, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
+        // This difference requires a 17-bit range, and all bits are
+        // significant for the following multiplication.
+        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
+        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
+        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
+        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
+        mul             \d0\().4s,  \d0\().4s,  v27.4s
+        mul             \t0\().4s,  \t0\().4s,  v27.4s
+        mul             \d1\().4s,  \d1\().4s,  v27.4s
+        mul             \t1\().4s,  \t1\().4s,  v27.4s
+        sshr            \d0\().4s,  \d0\().4s,  #4
+        sshr            \t0\().4s,  \t0\().4s,  #4
+        sshr            \d1\().4s,  \d1\().4s,  #4
+        sshr            \t1\().4s,  \t1\().4s,  #4
+        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
+        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
+        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
+        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
+        xtn             \d0\().4h,  \d0\().4s
+        xtn2            \d0\().8h,  \t0\().4s
+        xtn             \d1\().4h,  \d1\().4s
+        xtn2            \d1\().8h,  \t1\().4s
+        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
+        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
+        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
+        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
+        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
+        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
+.endm
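+
+// Hedged scalar sketch of the macro above (v27 holds the negated weight
+// from w6, so the negation folds into the order of the subtraction):
+//
+//   d   = tmp2 + (((tmp1 - tmp2) * w) >> 4);  // == (tmp1*w + tmp2*(16-w)) >> 4
+//   d   = (d + rnd) >> intermediate_bits;     // srshl by -intermediate_bits
+//   d  += PREP_BIAS >> intermediate_bits;
+//   dst = iclip(d, 0, bitdepth_max);          // smax/smin with v30/v31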
+
+.macro mask d0, d1, t0, t1, t2, t3
+        ld1             {v27.16b}, [x6],  16
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
+        neg             v27.16b, v27.16b
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
+        sxtl            v26.8h,  v27.8b
+        sxtl2           v27.8h,  v27.16b
+        sxtl            v24.4s,  v26.4h
+        sxtl2           v25.4s,  v26.8h
+        sxtl            v26.4s,  v27.4h
+        sxtl2           v27.4s,  v27.8h
+        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
+        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
+        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
+        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
+        mul             \d0\().4s,  \d0\().4s,  v24.4s
+        mul             \t0\().4s,  \t0\().4s,  v25.4s
+        mul             \d1\().4s,  \d1\().4s,  v26.4s
+        mul             \t1\().4s,  \t1\().4s,  v27.4s
+        sshr            \d0\().4s,  \d0\().4s,  #6
+        sshr            \t0\().4s,  \t0\().4s,  #6
+        sshr            \d1\().4s,  \d1\().4s,  #6
+        sshr            \t1\().4s,  \t1\().4s,  #6
+        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
+        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
+        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
+        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
+        xtn             \d0\().4h,  \d0\().4s
+        xtn2            \d0\().8h,  \t0\().4s
+        xtn             \d1\().4h,  \d1\().4s
+        xtn2            \d1\().8h,  \t1\().4s
+        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
+        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
+        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
+        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
+        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
+        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
+.endm
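+
+// Hedged scalar sketch of the macro above; same structure as w_avg, except
+// that the weight is a per-pixel 0..64 mask value m loaded from x6 (negated
+// so the same subtraction order can be reused):
+//
+//   d   = tmp2 + (((tmp1 - tmp2) * m) >> 6);
+//   d   = (d + rnd) >> intermediate_bits;
+//   d  += PREP_BIAS >> intermediate_bits;
+//   dst = iclip(d, 0, bitdepth_max);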
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+        clz             w4,  w4
+.ifnc \type, avg
+        dup             v31.8h,  \bdmax // bitdepth_max
+        movi            v30.8h,  #0
+.endif
+        clz             w7,  \bdmax
+        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+        mov             w9,  #1
+        mov             w8,  #-2*PREP_BIAS
+        lsl             w9,  w9,  w7    // 1 << intermediate_bits
+        add             w7,  w7,  #1
+        sub             w8,  w8,  w9    // -2*PREP_BIAS - (1 << intermediate_bits)
+        neg             w7,  w7         // -(intermediate_bits+1)
+        dup             v28.8h,   w8    // -2*PREP_BIAS - (1 << intermediate_bits)
+        dup             v29.8h,   w7    // -(intermediate_bits+1)
+.else
+        mov             w8,  #PREP_BIAS
+        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
+        neg             w7,  w7         // -intermediate_bits
+        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
+        dup             v29.8h,  w7     // -intermediate_bits
+.endif
+.ifc \type, w_avg
+        dup             v27.4s,  w6
+        neg             v27.4s,  v27.4s
+.endif
+        adr             x7,  L(\type\()_tbl)
+        sub             w4,  w4,  #24
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        ldrh            w4,  [x7, x4, lsl #1]
+        sub             x7,  x7,  w4, uxtw
+        br              x7
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+4:
+        subs            w5,  w5,  #4
+        st1             {v4.d}[0],  [x0], x1
+        st1             {v4.d}[1],  [x7], x1
+        st1             {v5.d}[0],  [x0], x1
+        st1             {v5.d}[1],  [x7], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               4b
+80:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+8:
+        st1             {v4.8h},  [x0], x1
+        subs            w5,  w5,  #2
+        st1             {v5.8h},  [x7], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               8b
+16:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h}, [x0], x1
+        subs            w5,  w5,  #2
+        st1             {v6.8h, v7.8h}, [x0], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               16b
+32:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               32b
+640:
+        add             x7,  x0,  #64
+64:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
+        \type           v18, v19, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               64b
+1280:
+        add             x7,  x0,  #64
+        mov             x8,  #128
+        sub             x1,  x1,  #128
+128:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
+        \type           v18, v19, v0,  v1,  v2,  v3
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
+        \type           v18, v19, v0,  v1,  v2,  v3
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               128b
+0:
+        ret
+L(\type\()_tbl):
+        .hword L(\type\()_tbl) - 1280b
+        .hword L(\type\()_tbl) -  640b
+        .hword L(\type\()_tbl) -   32b
+        .hword L(\type\()_tbl) -   16b
+        .hword L(\type\()_tbl) -   80b
+        .hword L(\type\()_tbl) -   40b
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7
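+
+// Note on the dispatch pattern used by bidir_fn (and by most functions in
+// this file): the widths are powers of two in 4..128, so clz(w) - 24 is a
+// dense index (128 -> 0, 64 -> 1, ..., 4 -> 5) into a table of 16-bit
+// offsets relative to the table label (hedged sketch):
+//
+//   idx    = clz(w) - 24;
+//   target = tbl_addr - tbl[idx];  // ldrh + sub + br above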
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+        ldr             w8,  [sp]
+        clz             w9,  w4
+        adr             x10, L(w_mask_\type\()_tbl)
+        dup             v31.8h,  w8   // bitdepth_max
+        sub             w9,  w9,  #24
+        clz             w8,  w8       // clz(bitdepth_max)
+        ldrh            w9,  [x10,  x9,  lsl #1]
+        sub             x10, x10, w9,  uxtw
+        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+        mov             w9,  #PREP_BIAS*64
+        neg             w8,  w8       // -sh
+        mov             w11, #27615   // ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd
+        dup             v30.4s,  w9   // PREP_BIAS*64
+        dup             v29.4s,  w8   // -sh
+        dup             v0.8h,   w11
+.if \type == 444
+        movi            v1.16b,  #64
+.elseif \type == 422
+        dup             v2.8b,   w7
+        movi            v3.8b,   #129
+        sub             v3.8b,   v3.8b,   v2.8b
+.elseif \type == 420
+        dup             v2.8h,   w7
+        movi            v3.8h,   #1, lsl #8
+        sub             v3.8h,   v3.8h,   v2.8h
+.endif
+        add             x12,  x0,  x1
+        lsl             x1,   x1,  #1
+        br              x10
+4:
+        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+        subs            w5,  w5,  #4
+        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v7.8h
+        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bits)
+        ssubl2          v17.4s,  v6.8h,   v4.8h
+        ssubl           v18.4s,  v7.4h,   v5.4h
+        ssubl2          v19.4s,  v7.8h,   v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
+        sshll           v6.4s,   v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v7.4s,   v7.4s,   v30.4s
+        uxtl            v22.4s,  v20.4h
+        uxtl2           v23.4s,  v20.8h
+        uxtl            v24.4s,  v21.4h
+        uxtl2           v25.4s,  v21.8h
+        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
+        mla             v5.4s,   v17.4s,  v23.4s
+        mla             v6.4s,   v18.4s,  v24.4s
+        mla             v7.4s,   v19.4s,  v25.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        srshl           v7.4s,   v7.4s,   v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v6.4s
+        sqxtun2         v5.8h,   v7.4s
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+.if \type == 444
+        xtn             v20.8b,  v20.8h           // 64 - m
+        xtn2            v20.16b, v21.8h
+        sub             v20.16b, v1.16b,  v20.16b // m
+        st1             {v20.16b}, [x6], #16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column-wise addition)
+        xtn             v20.8b,  v20.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        st1             {v20.8b}, [x6], #8
+.elseif \type == 420
+        trn1            v24.2d,  v20.2d,  v21.2d
+        trn2            v25.2d,  v20.2d,  v21.2d
+        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row-wise addition)
+        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column-wise addition)
+        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.s}[0], [x6], #4
+.endif
+        st1             {v4.d}[0],  [x0],  x1
+        st1             {v4.d}[1],  [x12], x1
+        st1             {v5.d}[0],  [x0],  x1
+        st1             {v5.d}[1],  [x12], x1
+        b.gt            4b
+        ret
+8:
+        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
+        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
+        subs            w5,  w5,  #2
+        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v7.8h
+        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bits)
+        ssubl2          v17.4s,  v6.8h,   v4.8h
+        ssubl           v18.4s,  v7.4h,   v5.4h
+        ssubl2          v19.4s,  v7.8h,   v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
+        sshll           v6.4s,   v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v7.4s,   v7.4s,   v30.4s
+        uxtl            v22.4s,  v20.4h
+        uxtl2           v23.4s,  v20.8h
+        uxtl            v24.4s,  v21.4h
+        uxtl2           v25.4s,  v21.8h
+        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
+        mla             v5.4s,   v17.4s,  v23.4s
+        mla             v6.4s,   v18.4s,  v24.4s
+        mla             v7.4s,   v19.4s,  v25.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        srshl           v7.4s,   v7.4s,   v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v6.4s
+        sqxtun2         v5.8h,   v7.4s
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+.if \type == 444
+        xtn             v20.8b,  v20.8h           // 64 - m
+        xtn2            v20.16b, v21.8h
+        sub             v20.16b, v1.16b,  v20.16b // m
+        st1             {v20.16b}, [x6], #16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column-wise addition)
+        xtn             v20.8b,  v20.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        st1             {v20.8b}, [x6], #8
+.elseif \type == 420
+        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row-wise addition)
+        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column-wise addition)
+        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.s}[0], [x6], #4
+.endif
+        st1             {v4.8h}, [x0],  x1
+        st1             {v5.8h}, [x12], x1
+        b.gt            8b
+        ret
+1280:
+640:
+320:
+160:
+        mov             w11, w4
+        sub             x1,  x1,  w4,  uxtw #1
+.if \type == 444
+        add             x10, x6,  w4,  uxtw
+.elseif \type == 422
+        add             x10, x6,  x11, lsr #1
+.endif
+        add             x9,  x3,  w4,  uxtw #1
+        add             x7,  x2,  w4,  uxtw #1
+161:
+        mov             w8,  w4
+16:
+        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
+        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
+        ld1             {v6.8h,   v7.8h},  [x7], #32
+        ld1             {v18.8h,  v19.8h}, [x9], #32
+        subs            w8,  w8,  #16
+        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v17.8h
+        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bits)
+        ssubl2          v23.4s,  v16.8h,  v4.8h
+        ssubl           v24.4s,  v17.4h,  v5.4h
+        ssubl2          v25.4s,  v17.8h,  v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
+        sshll           v26.4s,  v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        add             v27.4s,  v27.4s,  v30.4s
+        uxtl            v16.4s,  v20.4h
+        uxtl2           v17.4s,  v20.8h
+        uxtl            v28.4s,  v21.4h
+        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
+        uxtl2           v16.4s,  v21.8h
+        mla             v5.4s,   v23.4s,  v17.4s
+        mla             v26.4s,  v24.4s,  v28.4s
+        mla             v27.4s,  v25.4s,  v16.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v26.4s,  v26.4s,  v29.4s
+        srshl           v27.4s,  v27.4s,  v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v26.4s
+        sqxtun2         v5.8h,   v27.4s
+
+        // Start of other half
+        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
+        sabd            v23.8h,  v7.8h,   v19.8h
+
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+
+        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v17.4s,  v18.8h,  v6.8h
+        ssubl           v18.4s,  v19.4h,  v7.4h
+        ssubl2          v19.4s,  v19.8h,  v7.8h
+        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
+        uqsub           v23.8h,  v0.8h,   v23.8h
+        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
+        sshll2          v25.4s,  v6.8h,   #6
+        sshll           v26.4s,  v7.4h,   #6
+        sshll2          v27.4s,  v7.8h,   #6
+        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v23.8h,  v23.8h,  #10
+        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
+        add             v25.4s,  v25.4s,  v30.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        add             v27.4s,  v27.4s,  v30.4s
+        uxtl            v6.4s,   v22.4h
+        uxtl2           v7.4s,   v22.8h
+        uxtl            v28.4s,  v23.4h
+        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
+        uxtl2           v6.4s,   v23.8h
+        mla             v25.4s,  v17.4s,  v7.4s
+        mla             v26.4s,  v18.4s,  v28.4s
+        mla             v27.4s,  v19.4s,  v6.4s
+        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v25.4s,  v25.4s,  v29.4s
+        srshl           v26.4s,  v26.4s,  v29.4s
+        srshl           v27.4s,  v27.4s,  v29.4s
+        sqxtun          v6.4h,   v24.4s           // iclip_pixel
+        sqxtun2         v6.8h,   v25.4s
+        sqxtun          v7.4h,   v26.4s
+        sqxtun2         v7.8h,   v27.4s
+        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
+        umin            v7.8h,   v7.8h,   v31.8h
+.if \type == 444
+        xtn             v20.8b,  v20.8h           // 64 - m
+        xtn2            v20.16b, v21.8h
+        xtn             v21.8b,  v22.8h
+        xtn2            v21.16b, v23.8h
+        sub             v20.16b, v1.16b,  v20.16b // m
+        sub             v21.16b, v1.16b,  v21.16b
+        st1             {v20.16b}, [x6],  #16
+        st1             {v21.16b}, [x10], #16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column-wise addition)
+        addp            v21.8h,  v22.8h,  v23.8h
+        xtn             v20.8b,  v20.8h
+        xtn             v21.8b,  v21.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        uhsub           v21.8b,  v3.8b,   v21.8b
+        st1             {v20.8b}, [x6],  #8
+        st1             {v21.8b}, [x10], #8
+.elseif \type == 420
+        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row-wise addition)
+        add             v21.8h,  v21.8h,  v23.8h
+        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column-wise addition)
+        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.8b}, [x6], #8
+.endif
+        st1             {v4.8h, v5.8h}, [x0],  #32
+        st1             {v6.8h, v7.8h}, [x12], #32
+        b.gt            16b
+        subs            w5,  w5,  #2
+        add             x2,  x2,  w4,  uxtw #1
+        add             x3,  x3,  w4,  uxtw #1
+        add             x7,  x7,  w4,  uxtw #1
+        add             x9,  x9,  w4,  uxtw #1
+.if \type == 444
+        add             x6,  x6,  w4,  uxtw
+        add             x10, x10, w4,  uxtw
+.elseif \type == 422
+        add             x6,  x6,  x11, lsr #1
+        add             x10, x10, x11, lsr #1
+.endif
+        add             x0,  x0,  x1
+        add             x12, x12, x1
+        b.gt            161b
+        ret
+L(w_mask_\type\()_tbl):
+        .hword L(w_mask_\type\()_tbl) - 1280b
+        .hword L(w_mask_\type\()_tbl) -  640b
+        .hword L(w_mask_\type\()_tbl) -  320b
+        .hword L(w_mask_\type\()_tbl) -  160b
+        .hword L(w_mask_\type\()_tbl) -    8b
+        .hword L(w_mask_\type\()_tbl) -    4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
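+
+// Hedged scalar sketch of the three instantiations above. The mask is
+// derived from how much the two intermediates disagree, saturating m into
+// the 38..64 range (hence the 27615 constant set up at the entry point),
+// and the blend matches the mask macro earlier in the file:
+//
+//   inv = sat_sub_u16(27615, abs(tmp1 - tmp2)) >> 10;     // 64 - m
+//   dst = iclip_pixel(((tmp1 << 6) + (tmp2 - tmp1) * inv
+//                      + PREP_BIAS*64 + (1 << (sh - 1))) >> sh);
+//
+// The mask itself is stored at full resolution (444), summed over
+// horizontal pairs (422) or over 2x2 groups (420), with the sign and
+// rounding handling given in the comments above.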
+
+
+function blend_16bpc_neon, export=1
+        adr             x6,  L(blend_tbl)
+        clz             w3,  w3
+        sub             w3,  w3,  #26
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw
+        add             x8,  x0,  x1
+        br              x6
+40:
+        lsl             x1,  x1,  #1
+4:
+        ld1             {v2.8b},   [x5], #8
+        ld1             {v1.8h},   [x2], #16
+        ld1             {v0.d}[0], [x0]
+        neg             v2.8b,   v2.8b            // -m
+        subs            w4,  w4,  #2
+        ld1             {v0.d}[1], [x8]
+        sxtl            v2.8h,   v2.8b
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h
+        st1             {v0.d}[0], [x0], x1
+        st1             {v0.d}[1], [x8], x1
+        b.gt            4b
+        ret
+80:
+        lsl             x1,  x1,  #1
+8:
+        ld1             {v4.16b},       [x5], #16
+        ld1             {v2.8h, v3.8h}, [x2], #32
+        neg             v5.16b,  v4.16b           // -m
+        ld1             {v0.8h},   [x0]
+        ld1             {v1.8h},   [x8]
+        sxtl            v4.8h,   v5.8b
+        sxtl2           v5.8h,   v5.16b
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+        shl             v5.8h,   v5.8h,   #9
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        subs            w4,  w4,  #2
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v5.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
+        b.gt            8b
+        ret
+160:
+        lsl             x1,  x1,  #1
+16:
+        ld1             {v16.16b, v17.16b},           [x5], #32
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        subs            w4,  w4,  #2
+        neg             v18.16b, v16.16b          // -m
+        neg             v19.16b, v17.16b
+        ld1             {v0.8h, v1.8h}, [x0]
+        sxtl            v16.8h,  v18.8b
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        ld1             {v2.8h, v3.8h}, [x8]
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        shl             v18.8h,  v18.8h,  #9
+        shl             v19.8h,  v19.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v17.8h
+        sqrdmulh        v6.8h,   v6.8h,   v18.8h
+        sqrdmulh        v7.8h,   v7.8h,   v19.8h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v2.8h, v3.8h}, [x8], x1
+        b.gt            16b
+        ret
+32:
+        ld1             {v16.16b, v17.16b},           [x5], #32
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        subs            w4,  w4,  #1
+        neg             v18.16b, v16.16b          // -m
+        neg             v19.16b, v17.16b
+        sxtl            v16.8h,  v18.8b
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        shl             v18.8h,  v18.8h,  #9
+        shl             v19.8h,  v19.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v17.8h
+        sqrdmulh        v6.8h,   v6.8h,   v18.8h
+        sqrdmulh        v7.8h,   v7.8h,   v19.8h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        b.gt            32b
+        ret
+L(blend_tbl):
+        .hword L(blend_tbl) -  32b
+        .hword L(blend_tbl) - 160b
+        .hword L(blend_tbl) -  80b
+        .hword L(blend_tbl) -  40b
+endfunc
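+
+// Hedged sketch of the blend kernel above (also used by blend_h and
+// blend_v below): with a = dst, b = tmp and a 0..64 mask m, the intended
+// result is
+//
+//   dst = (a*(64 - m) + b*m + 32) >> 6;
+//
+// rewritten as a + (((b - a)*m + 32) >> 6) so that a single sqrdmulh does
+// the multiply and the rounded shift: sqrdmulh(x, y) computes
+// (2*x*y + (1 << 15)) >> 16, so with y = -m << 9 it yields
+// ((a - b)*-m + 32) >> 6, exactly as the inline comments state.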
+
+function blend_h_16bpc_neon, export=1
+        adr             x6,  L(blend_h_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w4,  uxtw
+        sub             w4,  w4,  w4,  lsr #2
+        clz             w7,  w3
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        sub             w7,  w7,  #24
+        ldrh            w7,  [x6,  x7,  lsl #1]
+        sub             x6,  x6,  w7, uxtw
+        br              x6
+2:
+        ld2r            {v2.8b, v3.8b}, [x5], #2
+        ld1             {v1.4h},        [x2], #8
+        ext             v2.8b,   v2.8b,   v3.8b,   #6
+        subs            w4,  w4,  #2
+        neg             v2.8b,   v2.8b            // -m
+        ld1             {v0.s}[0], [x0]
+        ld1             {v0.s}[1], [x8]
+        sxtl            v2.8h,   v2.8b
+        shl             v2.4h,   v2.4h,   #9      // -m << 9
+        sub             v1.4h,   v0.4h,   v1.4h   // a - b
+        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
+        add             v0.4h,   v0.4h,   v1.4h
+        st1             {v0.s}[0], [x0], x1
+        st1             {v0.s}[1], [x8], x1
+        b.gt            2b
+        ret
+4:
+        ld2r            {v2.8b, v3.8b}, [x5], #2
+        ld1             {v1.8h},        [x2], #16
+        ext             v2.8b,   v2.8b,   v3.8b,   #4
+        subs            w4,  w4,  #2
+        neg             v2.8b,   v2.8b            // -m
+        ld1             {v0.d}[0],   [x0]
+        ld1             {v0.d}[1],   [x8]
+        sxtl            v2.8h,   v2.8b
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h
+        st1             {v0.d}[0], [x0], x1
+        st1             {v0.d}[1], [x8], x1
+        b.gt            4b
+        ret
+8:
+        ld2r            {v4.8b, v5.8b}, [x5], #2
+        ld1             {v2.8h, v3.8h}, [x2], #32
+        neg             v4.8b,   v4.8b            // -m
+        neg             v5.8b,   v5.8b
+        ld1             {v0.8h}, [x0]
+        subs            w4,  w4,  #2
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        ld1             {v1.8h}, [x8]
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+        shl             v5.8h,   v5.8h,   #9
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v5.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
+        b.gt            8b
+        ret
+16:
+        ld2r            {v16.8b, v17.8b}, [x5], #2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        neg             v16.8b,  v16.8b           // -m
+        neg             v17.8b,  v17.8b
+        ld1             {v0.8h, v1.8h},  [x0]
+        ld1             {v2.8h, v3.8h},  [x8]
+        subs            w4,  w4,  #2
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v16.8h
+        sqrdmulh        v6.8h,   v6.8h,   v17.8h
+        sqrdmulh        v7.8h,   v7.8h,   v17.8h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v2.8h, v3.8h}, [x8], x1
+        b.gt            16b
+        ret
+1280:
+640:
+320:
+        sub             x1,  x1,  w3,  uxtw #1
+        add             x7,  x2,  w3,  uxtw #1
+321:
+        ld2r            {v24.8b, v25.8b}, [x5], #2
+        mov             w6,  w3
+        neg             v24.8b,  v24.8b           // -m
+        neg             v25.8b,  v25.8b
+        sxtl            v24.8h,  v24.8b
+        sxtl            v25.8h,  v25.8b
+        shl             v24.8h,  v24.8h,  #9      // -m << 9
+        shl             v25.8h,  v25.8h,  #9
+32:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
+        subs            w6,  w6,  #32
+        sub             v16.8h,  v0.8h,   v16.8h  // a - b
+        sub             v17.8h,  v1.8h,   v17.8h
+        sub             v18.8h,  v2.8h,   v18.8h
+        sub             v19.8h,  v3.8h,   v19.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
+        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v17.8h,  v17.8h,  v24.8h
+        sqrdmulh        v18.8h,  v18.8h,  v24.8h
+        sqrdmulh        v19.8h,  v19.8h,  v24.8h
+        sub             v20.8h,  v4.8h,   v20.8h  // a - b
+        sub             v21.8h,  v5.8h,   v21.8h
+        sub             v22.8h,  v6.8h,   v22.8h
+        sub             v23.8h,  v7.8h,   v23.8h
+        add             v0.8h,   v0.8h,   v16.8h
+        add             v1.8h,   v1.8h,   v17.8h
+        add             v2.8h,   v2.8h,   v18.8h
+        add             v3.8h,   v3.8h,   v19.8h
+        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v21.8h,  v21.8h,  v25.8h
+        sqrdmulh        v22.8h,  v22.8h,  v25.8h
+        sqrdmulh        v23.8h,  v23.8h,  v25.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
+        add             v4.8h,   v4.8h,   v20.8h
+        add             v5.8h,   v5.8h,   v21.8h
+        add             v6.8h,   v6.8h,   v22.8h
+        add             v7.8h,   v7.8h,   v23.8h
+        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
+        b.gt            32b
+        subs            w4,  w4,  #2
+        add             x0,  x0,  x1
+        add             x8,  x8,  x1
+        add             x2,  x2,  w3,  uxtw #1
+        add             x7,  x7,  w3,  uxtw #1
+        b.gt            321b
+        ret
+L(blend_h_tbl):
+        .hword L(blend_h_tbl) - 1280b
+        .hword L(blend_h_tbl) -  640b
+        .hword L(blend_h_tbl) -  320b
+        .hword L(blend_h_tbl) -   16b
+        .hword L(blend_h_tbl) -    8b
+        .hword L(blend_h_tbl) -    4b
+        .hword L(blend_h_tbl) -    2b
+endfunc
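+
+// Note: blend_h iterates only h - (h >> 2) rows (the w4 adjustment at the
+// entry point); the per-row mask values are read from obmc_masks, offset
+// by the block height, and splatted across the row. The arithmetic is the
+// blend kernel sketched after blend_16bpc_neon above.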
+
+function blend_v_16bpc_neon, export=1
+        adr             x6,  L(blend_v_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w3,  uxtw
+        clz             w3,  w3
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        sub             w3,  w3,  #26
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw
+        br              x6
+20:
+        ld1r            {v2.8b}, [x5]
+        neg             v2.8b,   v2.8b            // -m
+        sxtl            v2.8h,   v2.8b
+        shl             v2.4h,   v2.4h,   #9      // -m << 9
+2:
+        ld1             {v1.s}[0], [x2], #4
+        ld1             {v0.h}[0], [x0]
+        subs            w4,  w4,  #2
+        ld1             {v1.h}[1], [x2]
+        ld1             {v0.h}[1], [x8]
+        add             x2,  x2,  #4
+        sub             v1.4h,   v0.4h,   v1.4h   // a - b
+        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
+        add             v0.4h,   v0.4h,   v1.4h
+        st1             {v0.h}[0], [x0],  x1
+        st1             {v0.h}[1], [x8],  x1
+        b.gt            2b
+        ret
+40:
+        ld1r            {v2.2s}, [x5]
+        sub             x1,  x1,  #4
+        neg             v2.8b,   v2.8b            // -m
+        sxtl            v2.8h,   v2.8b
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+4:
+        ld1             {v1.8h},   [x2], #16
+        ld1             {v0.d}[0], [x0]
+        ld1             {v0.d}[1], [x8]
+        subs            w4,  w4,  #2
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h
+        st1             {v0.s}[0], [x0], #4
+        st1             {v0.s}[2], [x8], #4
+        st1             {v0.h}[2], [x0], x1
+        st1             {v0.h}[6], [x8], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v4.8b}, [x5]
+        sub             x1,  x1,  #8
+        neg             v4.8b,   v4.8b            // -m
+        sxtl            v4.8h,   v4.8b
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+8:
+        ld1             {v2.8h, v3.8h}, [x2], #32
+        ld1             {v0.8h}, [x0]
+        ld1             {v1.8h}, [x8]
+        subs            w4,  w4,  #2
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v4.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.d}[0], [x0], #8
+        st1             {v1.d}[0], [x8], #8
+        st1             {v0.s}[2], [x0], x1
+        st1             {v1.s}[2], [x8], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v16.8b, v17.8b}, [x5]
+        sub             x1,  x1,  #16
+        neg             v16.8b,  v16.8b           // -m
+        neg             v17.8b,  v17.8b
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.4h,  v17.4h,  #9
+16:
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        ld1             {v0.8h, v1.8h}, [x0]
+        subs            w4,  w4,  #2
+        ld1             {v2.8h, v3.8h}, [x8]
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.4h,   v1.4h,   v5.4h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.4h,   v3.4h,   v7.4h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.4h,   v5.4h,   v17.4h
+        sqrdmulh        v6.8h,   v6.8h,   v16.8h
+        sqrdmulh        v7.4h,   v7.4h,   v17.4h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.4h,   v1.4h,   v5.4h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.4h,   v3.4h,   v7.4h
+        st1             {v0.8h}, [x0], #16
+        st1             {v2.8h}, [x8], #16
+        st1             {v1.4h}, [x0], x1
+        st1             {v3.4h}, [x8], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v24.16b, v25.16b},  [x5]
+        neg             v26.16b, v24.16b          // -m
+        neg             v27.8b,  v25.8b
+        sxtl            v24.8h,  v26.8b
+        sxtl2           v25.8h,  v26.16b
+        sxtl            v26.8h,  v27.8b
+        shl             v24.8h,  v24.8h,  #9      // -m << 9
+        shl             v25.8h,  v25.8h,  #9
+        shl             v26.8h,  v26.8h,  #9
+32:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
+        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
+        subs            w4,  w4,  #2
+        sub             v16.8h,  v0.8h,   v16.8h  // a - b
+        sub             v17.8h,  v1.8h,   v17.8h
+        sub             v18.8h,  v2.8h,   v18.8h
+        sub             v20.8h,  v4.8h,   v20.8h
+        sub             v21.8h,  v5.8h,   v21.8h
+        sub             v22.8h,  v6.8h,   v22.8h
+        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v17.8h,  v17.8h,  v25.8h
+        sqrdmulh        v18.8h,  v18.8h,  v26.8h
+        sqrdmulh        v20.8h,  v20.8h,  v24.8h
+        sqrdmulh        v21.8h,  v21.8h,  v25.8h
+        sqrdmulh        v22.8h,  v22.8h,  v26.8h
+        add             v0.8h,   v0.8h,   v16.8h
+        add             v1.8h,   v1.8h,   v17.8h
+        add             v2.8h,   v2.8h,   v18.8h
+        add             v4.8h,   v4.8h,   v20.8h
+        add             v5.8h,   v5.8h,   v21.8h
+        add             v6.8h,   v6.8h,   v22.8h
+        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
+        b.gt            32b
+        ret
+L(blend_v_tbl):
+        .hword L(blend_v_tbl) - 320b
+        .hword L(blend_v_tbl) - 160b
+        .hword L(blend_v_tbl) -  80b
+        .hword L(blend_v_tbl) -  40b
+        .hword L(blend_v_tbl) -  20b
+endfunc
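+
+// Note: blend_v is the transposed counterpart: one obmc_masks value per
+// column, constant down the rows, and only the left three quarters of the
+// block width are written back (hence the partial stores and the mixed
+// .8h/.4h arithmetic in the cases above).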
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24).
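+//
+// With no subpel filtering, 16bpc put is a plain row copy (hedged sketch);
+// the cases below differ only in how each row is loaded and stored:
+//
+//   for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
+//       memcpy(dst, src, w * 2);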
+function put_neon
+        adr             x10, L(put_tbl)
+        ldrh            w9, [x10, x9, lsl #1]
+        sub             x10, x10, w9, uxtw
+        br              x10
+
+2:
+        ld1             {v0.s}[0], [x2], x3
+        ld1             {v1.s}[0], [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        b.gt            2b
+        ret
+4:
+        ld1             {v0.8b}, [x2], x3
+        ld1             {v1.8b}, [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.8b}, [x0], x1
+        st1             {v1.8b}, [x0], x1
+        b.gt            4b
+        ret
+80:
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
+        add             x9,  x2,  x3
+        lsl             x3,  x3,  #1
+8:
+        ld1             {v0.16b}, [x2], x3
+        ld1             {v1.16b}, [x9], x3
+        subs            w5,  w5,  #2
+        st1             {v0.16b}, [x0], x1
+        st1             {v1.16b}, [x8], x1
+        b.gt            8b
+        ret
+16:
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        subs            w5,  w5,  #1
+        stp             x8,  x9,  [x0, #16]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            16b
+        ret
+32:
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        ldp             x10, x11, [x2, #32]
+        stp             x8,  x9,  [x0, #16]
+        subs            w5,  w5,  #1
+        ldp             x12, x13, [x2, #48]
+        stp             x10, x11, [x0, #32]
+        stp             x12, x13, [x0, #48]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            32b
+        ret
+64:
+        ldp             q0,  q1,  [x2]
+        ldp             q2,  q3,  [x2, #32]
+        stp             q0,  q1,  [x0]
+        ldp             q4,  q5,  [x2, #64]
+        stp             q2,  q3,  [x0, #32]
+        ldp             q6,  q7,  [x2, #96]
+        subs            w5,  w5,  #1
+        stp             q4,  q5,  [x0, #64]
+        stp             q6,  q7,  [x0, #96]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            64b
+        ret
+128:
+        ldp             q0,  q1,  [x2]
+        ldp             q2,  q3,  [x2, #32]
+        stp             q0,  q1,  [x0]
+        ldp             q4,  q5,  [x2, #64]
+        stp             q2,  q3,  [x0, #32]
+        ldp             q6,  q7,  [x2, #96]
+        subs            w5,  w5,  #1
+        stp             q4,  q5,  [x0, #64]
+        ldp             q16, q17, [x2, #128]
+        stp             q6,  q7,  [x0, #96]
+        ldp             q18, q19, [x2, #160]
+        stp             q16, q17, [x0, #128]
+        ldp             q20, q21, [x2, #192]
+        stp             q18, q19, [x0, #160]
+        ldp             q22, q23, [x2, #224]
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            128b
+        ret
+
+L(put_tbl):
+        .hword L(put_tbl) - 128b
+        .hword L(put_tbl) -  64b
+        .hword L(put_tbl) -  32b
+        .hword L(put_tbl) -  16b
+        .hword L(put_tbl) -  80b
+        .hword L(put_tbl) -   4b
+        .hword L(put_tbl) -   2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits, and
+// x8 to w*2.
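+//
+// With no subpel filtering, 16bpc prep shifts pixels up into the
+// intermediate format and subtracts the bias (hedged sketch):
+//
+//   for (int y = 0; y < h; y++, src += src_stride, tmp += w)
+//       for (int x = 0; x < w; x++)
+//           tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;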
+function prep_neon
+        adr             x10, L(prep_tbl)
+        ldrh            w9, [x10, x9, lsl #1]
+        dup             v31.8h,  w7   // intermediate_bits
+        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
+        sub             x10, x10, w9, uxtw
+        br              x10
+
+40:
+        add             x9,  x1,  x2
+        lsl             x2,  x2,  #1
+4:
+        ld1             {v0.d}[0], [x1], x2
+        ld1             {v0.d}[1], [x9], x2
+        subs            w4,  w4,  #2
+        sshl            v0.8h,   v0.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        st1             {v0.8h}, [x0], #16
+        b.gt            4b
+        ret
+80:
+        add             x9,  x1,  x2
+        lsl             x2,  x2,  #1
+8:
+        ld1             {v0.8h}, [x1], x2
+        ld1             {v1.8h}, [x9], x2
+        subs            w4,  w4,  #2
+        sshl            v0.8h,   v0.8h,   v31.8h
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            8b
+        ret
+16:
+        ldp             q0,  q1,  [x1]
+        add             x1,  x1,  x2
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1]
+        add             x1,  x1,  x2
+        subs            w4,  w4,  #2
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            16b
+        ret
+32:
+        ldp             q0,  q1,  [x1]
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        add             x1,  x1,  x2
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        subs            w4,  w4,  #1
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            32b
+        ret
+64:
+        ldp             q0,  q1,  [x1]
+        subs            w4,  w4,  #1
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        sshl            v1.8h,   v1.8h,   v31.8h
+        ldp             q4,  q5,  [x1, #64]
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        ldp             q6,  q7,  [x1, #96]
+        add             x1,  x1,  x2
+        sshl            v4.8h,   v4.8h,   v31.8h
+        sshl            v5.8h,   v5.8h,   v31.8h
+        sshl            v6.8h,   v6.8h,   v31.8h
+        sshl            v7.8h,   v7.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        stp             q0,  q1,  [x0]
+        sub             v4.8h,   v4.8h,   v30.8h
+        sub             v5.8h,   v5.8h,   v30.8h
+        stp             q2,  q3,  [x0, #32]
+        sub             v6.8h,   v6.8h,   v30.8h
+        sub             v7.8h,   v7.8h,   v30.8h
+        stp             q4,  q5,  [x0, #64]
+        stp             q6,  q7,  [x0, #96]
+        add             x0,  x0,  x8
+        b.gt            64b
+        ret
+128:
+        ldp             q0,  q1,  [x1]
+        subs            w4,  w4,  #1
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        sshl            v1.8h,   v1.8h,   v31.8h
+        ldp             q4,  q5,  [x1, #64]
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        ldp             q6,  q7,  [x1, #96]
+        sshl            v4.8h,   v4.8h,   v31.8h
+        sshl            v5.8h,   v5.8h,   v31.8h
+        ldp             q16, q17, [x1, #128]
+        sshl            v6.8h,   v6.8h,   v31.8h
+        sshl            v7.8h,   v7.8h,   v31.8h
+        ldp             q18, q19, [x1, #160]
+        sshl            v16.8h,  v16.8h,  v31.8h
+        sshl            v17.8h,  v17.8h,  v31.8h
+        ldp             q20, q21, [x1, #192]
+        sshl            v18.8h,  v18.8h,  v31.8h
+        sshl            v19.8h,  v19.8h,  v31.8h
+        ldp             q22, q23, [x1, #224]
+        add             x1,  x1,  x2
+        sshl            v20.8h,  v20.8h,  v31.8h
+        sshl            v21.8h,  v21.8h,  v31.8h
+        sshl            v22.8h,  v22.8h,  v31.8h
+        sshl            v23.8h,  v23.8h,  v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        stp             q0,  q1,  [x0]
+        sub             v4.8h,   v4.8h,   v30.8h
+        sub             v5.8h,   v5.8h,   v30.8h
+        stp             q2,  q3,  [x0, #32]
+        sub             v6.8h,   v6.8h,   v30.8h
+        sub             v7.8h,   v7.8h,   v30.8h
+        stp             q4,  q5,  [x0, #64]
+        sub             v16.8h,  v16.8h,  v30.8h
+        sub             v17.8h,  v17.8h,  v30.8h
+        stp             q6,  q7,  [x0, #96]
+        sub             v18.8h,  v18.8h,  v30.8h
+        sub             v19.8h,  v19.8h,  v30.8h
+        stp             q16, q17, [x0, #128]
+        sub             v20.8h,  v20.8h,  v30.8h
+        sub             v21.8h,  v21.8h,  v30.8h
+        stp             q18, q19, [x0, #160]
+        sub             v22.8h,  v22.8h,  v30.8h
+        sub             v23.8h,  v23.8h,  v30.8h
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x0,  x0,  x8
+        b.gt            128b
+        ret
+
+L(prep_tbl):
+        .hword L(prep_tbl) - 128b
+        .hword L(prep_tbl) -  64b
+        .hword L(prep_tbl) -  32b
+        .hword L(prep_tbl) -  16b
+        .hword L(prep_tbl) -  80b
+        .hword L(prep_tbl) -  40b
+endfunc
+
+
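+// Helper macros for the 8tap paths below. Trailing register arguments
+// are optional; the .ifnb (if-not-blank) guards let a single macro body
+// serve several unroll factors.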
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}[0], [\s0], \strd
+        ld1             {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}[0], [\s0], \strd
+        ld1             {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        ld1             {\d0\wd}, [\s0], \strd
+        ld1             {\d1\wd}, [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}, [\s0], \strd
+        ld1             {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
+        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
+.ifnb \d2
+        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
+        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+        trn1            \r0\wd, \r0\wd, \r1\wd
+        trn1            \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+        trn1            \r2\wd, \r2\wd, \r3\wd
+        trn1            \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3
+        umin            \r0\wd,  \r0\wd,  \c\wd
+.ifnb \r1
+        umin            \r1\wd,  \r1\wd,  \c\wd
+.endif
+.ifnb \r2
+        umin            \r2\wd,  \r2\wd,  \c\wd
+        umin            \r3\wd,  \r3\wd,  \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3
+        sub             \r0\wd,  \r0\wd,  \c\wd
+.ifnb \r1
+        sub             \r1\wd,  \r1\wd,  \c\wd
+.endif
+.ifnb \r2
+        sub             \r2\wd,  \r2\wd,  \c\wd
+        sub             \r3\wd,  \r3\wd,  \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3
+        smull           \d\().4s,  \s0\().4h,  v0.h[0]
+        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
+        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
+        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3
+        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
+        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
+        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
+        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull           \d\().4s,  \s0\().4h,  v0.h[0]
+        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
+        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
+        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
+        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
+        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
+        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
+        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
+        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
+        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
+        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
+        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
+        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
+        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
+        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
+.endm
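+// The smull_smlal_N and smull2_smlal2_N macros accumulate an N-tap dot
+// product, sum = s0*coef[0] + ... + s(N-1)*coef[N-1], into 32-bit lanes,
+// with the filter coefficients taken from v0.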
+.macro sqrshrun_h shift, r0, r1, r2, r3
+        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
+.ifnb \r1
+        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
+.endif
+.ifnb \r2
+        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
+        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3
+        xtn             \r0\().4h,  \r0\().4s
+        xtn2            \r0\().8h,  \r1\().4s
+.ifnb \r2
+        xtn             \r2\().4h,  \r2\().4s
+        xtn2            \r2\().8h,  \r3\().4s
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3
+        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
+        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
+.ifnb \r2
+        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
+        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes
+        st1             {\reg\().s}[0], [x0], \strd
+        st1             {\reg\().s}[1], [x9], \strd
+.if \lanes > 2
+        st1             {\reg\().s}[2], [x0], \strd
+        st1             {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3
+        umin_h          v31, .8h, \r0, \r2
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
+.endif
+        st_d            \strd, \r0, \r2
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+        st1             {\r0\wd}, [x0], \strd
+        st1             {\r1\wd}, [x9], \strd
+.ifnb \r2
+        st1             {\r2\wd}, [x0], \strd
+        st1             {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+        st1             {\r4\wd}, [x0], \strd
+        st1             {\r5\wd}, [x9], \strd
+        st1             {\r6\wd}, [x0], \strd
+        st1             {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
+        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3
+        umin_h          v31, .8h, \r0, \r2
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
+.endif
+        st_8h           \strd, \r0, \r2
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3
+        umin            \r0\().8h, \r0\().8h, v31.8h
+        umin            \r1\().8h, \r2\().8h, v31.8h
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub             \r0\().8h, \r0\().8h, v29.8h
+        sub             \r1\().8h, \r2\().8h, v29.8h
+.endif
+        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+        mov             w9,  \type_h
+        mov             w10, \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
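+// Each constant packs two filter-set base offsets (multiples of 15) into
+// two 7-bit fields. mx/my are multiplied by 0x4081 below, replicating the
+// subpel offset into bits 0, 7 and 14; after adding one of these constants,
+// bits 0-6 hold the mc_subpel_filters row used for w/h <= 4, bits 7-13 the
+// row used for larger sizes, and bits 14-20 the bare subpel offset (so
+// tst #(0x7f << 14) tests for mx/my != 0).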
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+        ldr             w8,  [sp]
+.endif
+        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx,  \mx, w11
+        mul             \my,  \my, w11
+        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
+        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+        uxtw            \d_strd, \w
+        lsl             \d_strd, \d_strd, #1
+.endif
+
+        dup             v31.8h,  \bdmax        // bitdepth_max
+        clz             \bdmax,  \bdmax
+        clz             w9,  \w
+        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
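+        // e.g. 10 bpc: bitdepth_max = 0x3ff, clz = 22 -> intermediate_bits = 4
+        //      12 bpc: bitdepth_max = 0xfff, clz = 20 -> intermediate_bits = 2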
+        mov             w12, #6
+        tst             \mx, #(0x7f << 14)
+        sub             w9,  w9,  #24
+        add             w13, w12, \bdmax       // 6 + intermediate_bits
+        sub             w12, w12, \bdmax       // 6 - intermediate_bits
+        movrel          x11, X(mc_subpel_filters), -8
+        b.ne            L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        b.ne            L(\type\()_8tap_v)
+        b               \type\()_neon
+
+L(\type\()_8tap_h):
+        cmp             \w,   #4
+        ubfx            w10,  \mx, #7, #7
+        and             \mx,  \mx, #0x7f
+        b.le            4f
+        mov             \mx,  w10
+4:
+        tst             \my,  #(0x7f << 14)
+        add             \xmx, x11, \mx, uxtw #3
+        b.ne            L(\type\()_8tap_hv)
+
+        adr             x10, L(\type\()_8tap_h_tbl)
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.ifc \type, put
+        dup             v29.8h,  \bdmax        // intermediate_bits
+.else
+        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v29.8h,  v29.8h        // -intermediate_bits
+.endif
+        br              x10
+
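+// In the put path, the horizontal filter output is rounded in two stages:
+// first by (6 - intermediate_bits) while still 32-bit, then by
+// intermediate_bits after saturating narrowing to 16 bits, followed by a
+// clamp to bitdepth_max. In the prep path only the first shift is applied,
+// and PREP_BIAS is subtracted instead of clamping.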
+20:     // 2xN h
+.ifc \type, put
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        sub             \src,  \src,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+2:
+        ld1             {v4.8h},  [\src], \s_strd
+        ld1             {v6.8h},  [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v4.16b,  #2
+        ext             v7.16b,  v6.16b,  v6.16b,  #2
+        subs            \h,  \h,  #2
+        trn1            v3.2s,   v4.2s,   v6.2s
+        trn2            v6.2s,   v4.2s,   v6.2s
+        trn1            v4.2s,   v5.2s,   v7.2s
+        trn2            v7.2s,   v5.2s,   v7.2s
+        smull           v3.4s,   v3.4h,   v0.h[0]
+        smlal           v3.4s,   v4.4h,   v0.h[1]
+        smlal           v3.4s,   v6.4h,   v0.h[2]
+        smlal           v3.4s,   v7.4h,   v0.h[3]
+        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
+        sqxtun          v3.4h,   v3.4s
+        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
+        umin            v3.4h,   v3.4h,   v31.4h
+        st1             {v3.s}[0], [\dst], \d_strd
+        st1             {v3.s}[1], [\ds2], \d_strd
+        b.gt            2b
+        ret
+.endif
+
+40:     // 4xN h
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        sub             \src,  \src,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+4:
+        ld1             {v16.8h}, [\src], \s_strd
+        ld1             {v20.8h}, [\sr2], \s_strd
+        ext             v17.16b, v16.16b, v16.16b, #2
+        ext             v18.16b, v16.16b, v16.16b, #4
+        ext             v19.16b, v16.16b, v16.16b, #6
+        ext             v21.16b, v20.16b, v20.16b, #2
+        ext             v22.16b, v20.16b, v20.16b, #4
+        ext             v23.16b, v20.16b, v20.16b, #6
+        subs            \h,  \h,  #2
+        smull           v16.4s,  v16.4h,  v0.h[0]
+        smlal           v16.4s,  v17.4h,  v0.h[1]
+        smlal           v16.4s,  v18.4h,  v0.h[2]
+        smlal           v16.4s,  v19.4h,  v0.h[3]
+        smull           v20.4s,  v20.4h,  v0.h[0]
+        smlal           v20.4s,  v21.4h,  v0.h[1]
+        smlal           v20.4s,  v22.4h,  v0.h[2]
+        smlal           v20.4s,  v23.4h,  v0.h[3]
+        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+        sqxtun          v16.4h,  v16.4s
+        sqxtun2         v16.8h,  v20.4s
+        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
+        umin            v16.8h,  v16.8h,  v31.8h
+.else
+        xtn             v16.4h,  v16.4s
+        xtn2            v16.8h,  v20.4s
+        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
+.endif
+        st1             {v16.d}[0], [\dst], \d_strd
+        st1             {v16.d}[1], [\ds2], \d_strd
+        b.gt            4b
+        ret
+
+80:
+160:
+320:
+640:
+1280:   // 8xN, 16xN, 32xN, ... h
+        ld1             {v0.8b}, [\xmx]
+        sub             \src,  \src,  #6
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        sub             \s_strd,  \s_strd,  \w, uxtw #1
+        sub             \s_strd,  \s_strd,  #16
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, uxtw #1
+.endif
+81:
+        ld1             {v16.8h, v17.8h},  [\src], #32
+        ld1             {v20.8h, v21.8h},  [\sr2], #32
+        mov             \mx, \w
+
+8:
+        smull           v18.4s,  v16.4h,  v0.h[0]
+        smull2          v19.4s,  v16.8h,  v0.h[0]
+        smull           v22.4s,  v20.4h,  v0.h[0]
+        smull2          v23.4s,  v20.8h,  v0.h[0]
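+        // .irpc expands the remaining 7 taps; ext slides a 2-byte window
+        // across the 32-byte rows held in v16/v17 and v20/v21.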
+.irpc i, 1234567
+        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+        smlal           v18.4s,  v24.4h,  v0.h[\i]
+        smlal2          v19.4s,  v24.8h,  v0.h[\i]
+        smlal           v22.4s,  v25.4h,  v0.h[\i]
+        smlal2          v23.4s,  v25.8h,  v0.h[\i]
+.endr
+        subs            \mx, \mx, #8
+        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+        sqxtun          v18.4h,  v18.4s
+        sqxtun2         v18.8h,  v19.4s
+        sqxtun          v22.4h,  v22.4s
+        sqxtun2         v22.8h,  v23.4s
+        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
+        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
+        umin            v18.8h,  v18.8h,  v31.8h
+        umin            v22.8h,  v22.8h,  v31.8h
+.else
+        xtn             v18.4h,  v18.4s
+        xtn2            v18.8h,  v19.4s
+        xtn             v22.4h,  v22.4s
+        xtn2            v22.8h,  v23.4s
+        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
+        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
+.endif
+        st1             {v18.8h}, [\dst], #16
+        st1             {v22.8h}, [\ds2], #16
+        b.le            9f
+
+        mov             v16.16b, v17.16b
+        mov             v20.16b, v21.16b
+        ld1             {v17.8h}, [\src], #16
+        ld1             {v21.8h}, [\sr2], #16
+        b               8b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            81b
+        ret
+
+L(\type\()_8tap_h_tbl):
+        .hword L(\type\()_8tap_h_tbl) - 1280b
+        .hword L(\type\()_8tap_h_tbl) -  640b
+        .hword L(\type\()_8tap_h_tbl) -  320b
+        .hword L(\type\()_8tap_h_tbl) -  160b
+        .hword L(\type\()_8tap_h_tbl) -   80b
+        .hword L(\type\()_8tap_h_tbl) -   40b
+        .hword L(\type\()_8tap_h_tbl) -   20b
+        .hword 0  // padding; keeps the code after the table 4-byte aligned
+
+
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            w10, \my, #7, #7
+        and             \my, \my, #0x7f
+        b.le            4f
+        mov             \my, w10
+4:
+        add             \xmy, x11, \my, uxtw #3
+
+.ifc \type, prep
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        adr             x10, L(\type\()_8tap_v_tbl)
+        ldrh            w9,  [x10, x9, lsl #1]
+.ifc \type, prep
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.endif
+        sub             x10, x10, w9, uxtw
+        br              x10
+
+20:     // 2xN v
+.ifc \type, put
+        b.gt            28f
+
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        // 2x2 v
+        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        interleave_1_s  v1,  v2,  v3,  v4,  v5
+        b.gt            24f
+        smull_smlal_4   v6,  v1,  v2,  v3,  v4
+        sqrshrun_h      6,   v6
+        umin_h          v31, .8h, v6
+        st_s            \d_strd, v6, 2
+        ret
+
+24:     // 2x4 v
+        load_s          \sr2, \src, \s_strd, v6, v7
+        interleave_1_s  v5,  v6,  v7
+        smull_smlal_4   v16, v1,  v2,  v3,  v4
+        smull_smlal_4   v17, v3,  v4,  v5,  v6
+        sqrshrun_h      6,   v16, v17
+        umin_h          v31, .8h, v16
+        st_s            \d_strd, v16, 4
+        ret
+
+28:     // 2x8, 2x16 v
+        ld1             {v0.8b}, [\xmy]
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
+        interleave_1_s  v1,  v2,  v3,  v4,  v5
+        interleave_1_s  v5,  v6,  v7
+216:
+        subs            \h,  \h,  #8
+        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
+        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
+        interleave_1_s  v7,  v16, v17, v18, v19
+        interleave_1_s  v19, v20, v21, v22, v23
+        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
+        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
+        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
+        sqrshrun_h      6,   v24, v25, v26, v27
+        umin_h          v31, .8h, v24, v26
+        st_s            \d_strd, v24, 4
+        st_s            \d_strd, v26, 4
+        b.le            0f
+        mov             v1.16b,  v17.16b
+        mov             v2.16b,  v18.16b
+        mov             v3.16b,  v19.16b
+        mov             v4.16b,  v20.16b
+        mov             v5.16b,  v21.16b
+        mov             v6.16b,  v22.16b
+        mov             v7.16b,  v23.16b
+        b               216b
+0:
+        ret
+.endif
+
+40:
+        b.gt            480f
+
+        // 4x2, 4x4 v
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        smull_smlal_4   v6,  v1,  v2,  v3,  v4
+        smull_smlal_4   v7,  v2,  v3,  v4,  v5
+        shift_store_4   \type, \d_strd, v6, v7
+        b.le            0f
+        load_4h         \sr2, \src, \s_strd, v6, v7
+        smull_smlal_4   v1,  v3,  v4,  v5,  v6
+        smull_smlal_4   v2,  v4,  v5,  v6,  v7
+        shift_store_4   \type, \d_strd, v1, v2
+0:
+        ret
+
+480:    // 4x8, 4x16 v
+        ld1             {v0.8b}, [\xmy]
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+        subs            \h,  \h,  #4
+        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        shift_store_4   \type, \d_strd, v1, v2, v3, v4
+        b.le            0f
+        mov             v16.8b,  v20.8b
+        mov             v17.8b,  v21.8b
+        mov             v18.8b,  v22.8b
+        mov             v19.8b,  v23.8b
+        mov             v20.8b,  v24.8b
+        mov             v21.8b,  v25.8b
+        mov             v22.8b,  v26.8b
+        b               48b
+0:
+        ret
+
+80:
+        b.gt            880f
+
+        // 8x2, 8x4 v
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        smull_smlal_4   v16, v1,  v2,  v3,  v4
+        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
+        smull_smlal_4   v18, v2,  v3,  v4,  v5
+        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
+        shift_store_8   \type, \d_strd, v16, v17, v18, v19
+        b.le            0f
+        load_8h         \sr2, \src, \s_strd, v6, v7
+        smull_smlal_4   v16, v3,  v4,  v5,  v6
+        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
+        smull_smlal_4   v18, v4,  v5,  v6,  v7
+        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
+        shift_store_8   \type, \d_strd, v16, v17, v18, v19
+0:
+        ret
+
+880:    // 8x6, 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        ld1             {v0.8b}, [\xmy]
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        sxtl            v0.8h,   v0.8b
+        mov             \my,  \h
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+        subs            \h,  \h,  #2
+        load_8h         \sr2, \src, \s_strd, v23, v24
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
+        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8h         \sr2, \src, \s_strd, v25, v26
+        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
+        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        b.le            9f
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        b               88b
+9:
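+        // Advance to the next 8-pixel column: undo the stride doubling,
+        // rewind src/dst to the top of the column (including the rows of
+        // vertical filter context on src), restore h, and step 16 bytes
+        // (8 pixels) to the right.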
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        msub            \src, \s_strd, \xmy, \src
+        msub            \dst, \d_strd, \xmy, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        add             \src, \src, #16
+        add             \dst, \dst, #16
+        b               168b
+0:
+        ret
+
+160:
+        b.gt            1680b
+
+        // 16x2, 16x4 v
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        sub             \src, \src, \s_strd
+        sxtl            v0.8h,   v0.8b
+
+        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+        load_16h        \src, \src, \s_strd, v22, v23
+        subs            \h,  \h,  #1
+        smull_smlal_4   v1,  v16, v18, v20, v22
+        smull2_smlal2_4 v2,  v16, v18, v20, v22
+        smull_smlal_4   v3,  v17, v19, v21, v23
+        smull2_smlal2_4 v4,  v17, v19, v21, v23
+        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
+        b.le            0f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v23.16b
+        b               16b
+0:
+        ret
+
+L(\type\()_8tap_v_tbl):
+        .hword L(\type\()_8tap_v_tbl) - 1280b
+        .hword L(\type\()_8tap_v_tbl) -  640b
+        .hword L(\type\()_8tap_v_tbl) -  320b
+        .hword L(\type\()_8tap_v_tbl) -  160b
+        .hword L(\type\()_8tap_v_tbl) -   80b
+        .hword L(\type\()_8tap_v_tbl) -   40b
+        .hword L(\type\()_8tap_v_tbl) -   20b
+        .hword 0  // padding; keeps the code after the table 4-byte aligned
+
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            w10, \my, #7, #7
+        and             \my, \my, #0x7f
+        b.le            4f
+        mov             \my,  w10
+4:
+        add             \xmy, x11, \my, uxtw #3
+
+        adr             x10, L(\type\()_8tap_hv_tbl)
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.ifc \type, put
+        dup             v29.4s,  w13           // 6 + intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
+.endif
+        br              x10
+
+20:
+.ifc \type, put
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0],  [\xmx]
+        b.gt            280f
+        add             \xmy,  \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+
+        // 2x2, 2x4 hv
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30  // preserve the return address; bl below clobbers x30
+        sxtl            v1.4s,   v1.4h
+
+        ld1             {v27.8h}, [\src], \s_strd
+        ext             v28.16b, v27.16b, v27.16b, #2
+        smull           v27.4s,  v27.4h,  v0.4h
+        smull           v28.4s,  v28.4h,  v0.4h
+        addp            v27.4s,  v27.4s,  v28.4s
+        addp            v16.4s,  v27.4s,  v27.4s
+        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+        bl              L(\type\()_8tap_filter_2)
+
+        trn1            v16.2d,  v16.2d,  v24.2d
+        mov             v17.16b, v24.16b
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        ext             v18.16b, v17.16b, v24.16b, #8
+        mov             v19.16b, v24.16b
+        mul             v2.4s,   v16.4s,  v1.s[0]
+        mla             v2.4s,   v17.4s,  v1.s[1]
+        mla             v2.4s,   v18.4s,  v1.s[2]
+        mla             v2.4s,   v19.4s,  v1.s[3]
+
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        umin            v2.4h,   v2.4h,   v31.4h
+        subs            \h,  \h,  #2
+        st1             {v2.s}[0], [\dst], \d_strd
+        st1             {v2.s}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        ld1             {v1.8b},  [\xmy]
+        sub             \src, \src, #2
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+        sxtl2           v2.4s,   v1.8h
+        sxtl            v1.4s,   v1.4h
+
+        ld1             {v27.8h}, [\src], \s_strd
+        ext             v28.16b, v27.16b, v27.16b, #2
+        smull           v27.4s,  v27.4h,  v0.4h
+        smull           v28.4s,  v28.4h,  v0.4h
+        addp            v27.4s,  v27.4s,  v28.4s
+        addp            v16.4s,  v27.4s,  v27.4s
+        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+
+        bl              L(\type\()_8tap_filter_2)
+        trn1            v16.2d,  v16.2d,  v24.2d
+        mov             v17.16b, v24.16b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v18.16b, v17.16b, v24.16b, #8
+        mov             v19.16b, v24.16b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v20.16b, v19.16b, v24.16b, #8
+        mov             v21.16b, v24.16b
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        ext             v22.16b, v21.16b, v24.16b, #8
+        mov             v23.16b, v24.16b
+        mul             v3.4s,   v16.4s,  v1.s[0]
+        mla             v3.4s,   v17.4s,  v1.s[1]
+        mla             v3.4s,   v18.4s,  v1.s[2]
+        mla             v3.4s,   v19.4s,  v1.s[3]
+        mla             v3.4s,   v20.4s,  v2.s[0]
+        mla             v3.4s,   v21.4s,  v2.s[1]
+        mla             v3.4s,   v22.4s,  v2.s[2]
+        mla             v3.4s,   v23.4s,  v2.s[3]
+
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v3.4h,   v3.4s
+        umin            v3.4h,   v3.4h,   v31.4h
+        subs            \h,  \h,  #2
+        st1             {v3.s}[0], [\dst], \d_strd
+        st1             {v3.s}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v23.16b
+        b               28b
+
+0:
+        br              x15
+
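+// Horizontal 4-tap filter for two rows at a time, used by the 2xN hv
+// loops above; the filter output for two rows is returned in v24.4s.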
+L(\type\()_8tap_filter_2):
+        ld1             {v25.8h},  [\sr2], \s_strd
+        ld1             {v27.8h},  [\src], \s_strd
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v28.16b, v27.16b, v27.16b, #2
+        trn1            v24.2s,  v25.2s,  v27.2s
+        trn2            v27.2s,  v25.2s,  v27.2s
+        trn1            v25.2s,  v26.2s,  v28.2s
+        trn2            v28.2s,  v26.2s,  v28.2s
+        smull           v24.4s,  v24.4h,  v0.h[0]
+        smlal           v24.4s,  v25.4h,  v0.h[1]
+        smlal           v24.4s,  v27.4h,  v0.h[2]
+        smlal           v24.4s,  v28.4h,  v0.h[3]
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        ret
+.endif
+
+40:
+        add             \xmx, \xmx, #2
+        ld1             {v0.s}[0],  [\xmx]
+        b.gt            480f
+        add             \xmy, \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+
+        // 4x2, 4x4 hv
+        ld1             {v25.8h}, [\src], \s_strd
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v27.16b, v25.16b, v25.16b, #4
+        ext             v28.16b, v25.16b, v25.16b, #6
+        smull           v25.4s,  v25.4h,  v0.h[0]
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as the A53).
+        xtn             v16.4h,  v16.4s
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b,  v24.8b
+        mov             v18.8b,  v25.8b
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal           v2.4s,   v24.4h,  v1.h[3]
+        smull           v3.4s,   v17.4h,  v1.h[0]
+        smlal           v3.4s,   v18.4h,  v1.h[1]
+        smlal           v3.4s,   v24.4h,  v1.h[2]
+        smlal           v3.4s,   v25.4h,  v1.h[3]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        sqxtun2         v2.8h,   v3.4s
+        umin            v2.8h,   v2.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6
+        rshrn2          v2.8h,   v3.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+
+        st1             {v2.d}[0], [\dst], \d_strd
+        st1             {v2.d}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v24.8b
+        mov             v18.8b,  v25.8b
+        b               4b
+
+480:    // 4x8, 4x16, 4x32 hv
+        ld1             {v1.8b},  [\xmy]
+        sub             \src, \src, #2
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+
+        ld1             {v25.8h}, [\src], \s_strd
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v27.16b, v25.16b, v25.16b, #4
+        ext             v28.16b, v25.16b, v25.16b, #6
+        smull           v25.4s,  v25.4h,  v0.h[0]
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as the A53).
+        xtn             v16.4h,  v16.4s
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b,  v24.8b
+        mov             v18.8b,  v25.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v19.8b,  v24.8b
+        mov             v20.8b,  v25.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v21.8b,  v24.8b
+        mov             v22.8b,  v25.8b
+
+48:
+        bl              L(\type\()_8tap_filter_4)
+        smull           v3.4s,   v16.4h,  v1.h[0]
+        smlal           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+        smlal           v3.4s,   v24.4h,  v1.h[7]
+        smull           v4.4s,   v17.4h,  v1.h[0]
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal           v4.4s,   v19.4h,  v1.h[2]
+        smlal           v4.4s,   v20.4h,  v1.h[3]
+        smlal           v4.4s,   v21.4h,  v1.h[4]
+        smlal           v4.4s,   v22.4h,  v1.h[5]
+        smlal           v4.4s,   v24.4h,  v1.h[6]
+        smlal           v4.4s,   v25.4h,  v1.h[7]
+.ifc \type, put
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v3.4h,   v3.4s
+        sqxtun2         v3.8h,   v4.4s
+        umin            v3.8h,   v3.8h,   v31.8h
+.else
+        rshrn           v3.4h,   v3.4s,   #6
+        rshrn2          v3.8h,   v4.4s,   #6
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        st1             {v3.d}[0], [\dst], \d_strd
+        st1             {v3.d}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v24.8b
+        mov             v22.8b,  v25.8b
+        b               48b
+0:
+        br              x15
+
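+// Horizontal 4-tap filter for two rows of 4 pixels, used by the 4xN hv
+// loops above; results are narrowed to 16 bits and returned in v24/v25.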
+L(\type\()_8tap_filter_4):
+        ld1             {v24.8h}, [\sr2], \s_strd
+        ld1             {v25.8h}, [\src], \s_strd
+        ext             v26.16b, v24.16b, v24.16b, #2
+        ext             v27.16b, v24.16b, v24.16b, #4
+        ext             v28.16b, v24.16b, v24.16b, #6
+        smull           v24.4s,  v24.4h,  v0.h[0]
+        smlal           v24.4s,  v26.4h,  v0.h[1]
+        smlal           v24.4s,  v27.4h,  v0.h[2]
+        smlal           v24.4s,  v28.4h,  v0.h[3]
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v27.16b, v25.16b, v25.16b, #4
+        ext             v28.16b, v25.16b, v25.16b, #6
+        smull           v25.4s,  v25.4h,  v0.h[0]
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v24.4h,  v24.4s
+        xtn             v25.4h,  v25.4s
+        ret
+
+80:
+160:
+320:
+        b.gt            880f
+        add             \xmy,  \xmy,  #2
+        ld1             {v0.8b},  [\xmx]
+        ld1             {v1.s}[0],  [\xmy]
+        sub             \src,  \src,  #6
+        sub             \src,  \src,  \s_strd
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+        mov             \my, \h
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v27.8h, v28.8h},  [\src], \s_strd
+        smull           v24.4s,  v27.4h,  v0.h[0]
+        smull2          v25.4s,  v27.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+.endr
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as the
+        // A53), and conserves register space (no need to clobber v8-v15).
+        xtn             v16.4h,  v24.4s
+        xtn2            v16.8h,  v25.4s
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v23.16b
+        mov             v18.16b, v24.16b
+
+8:
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smull2          v3.4s,   v16.8h,  v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,   v17.4h,  v1.h[0]
+        smull2          v5.4s,   v17.8h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal2          v3.4s,   v17.8h,  v1.h[1]
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal2          v5.4s,   v18.8h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal2          v3.4s,   v18.8h,  v1.h[2]
+        smlal           v4.4s,   v23.4h,  v1.h[2]
+        smlal2          v5.4s,   v23.8h,  v1.h[2]
+        smlal           v2.4s,   v23.4h,  v1.h[3]
+        smlal2          v3.4s,   v23.8h,  v1.h[3]
+        smlal           v4.4s,   v24.4h,  v1.h[3]
+        smlal2          v5.4s,   v24.8h,  v1.h[3]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        sqxtun2         v2.8h,   v3.4s
+        sqxtun          v3.4h,   v4.4s
+        sqxtun2         v3.8h,   v5.4s
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        st1             {v2.8h}, [\dst], \d_strd
+        st1             {v3.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v23.16b
+        mov             v18.16b, v24.16b
+        b               8b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #2
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               164b
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+        ld1             {v0.8b},  [\xmx]
+        ld1             {v1.8b},  [\xmy]
+        sub             \src,  \src,  #6
+        sub             \src,  \src,  \s_strd
+        sub             \src,  \src,  \s_strd, lsl #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+        mov             \my, \h
+
+168:
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v27.8h, v28.8h},  [\src], \s_strd
+        smull           v24.4s,  v27.4h,  v0.h[0]
+        smull2          v25.4s,  v27.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+.endr
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as the
+        // A53), and conserves register space (no need to clobber v8-v15).
+        xtn             v16.4h,  v24.4s
+        xtn2            v16.8h,  v25.4s
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v23.16b
+        mov             v18.16b, v24.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+
+88:
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smull2          v3.4s,   v16.8h,  v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,   v17.4h,  v1.h[0]
+        smull2          v5.4s,   v17.8h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal2          v3.4s,   v17.8h,  v1.h[1]
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal2          v5.4s,   v18.8h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal2          v3.4s,   v18.8h,  v1.h[2]
+        smlal           v4.4s,   v19.4h,  v1.h[2]
+        smlal2          v5.4s,   v19.8h,  v1.h[2]
+        smlal           v2.4s,   v19.4h,  v1.h[3]
+        smlal2          v3.4s,   v19.8h,  v1.h[3]
+        smlal           v4.4s,   v20.4h,  v1.h[3]
+        smlal2          v5.4s,   v20.8h,  v1.h[3]
+        smlal           v2.4s,   v20.4h,  v1.h[4]
+        smlal2          v3.4s,   v20.8h,  v1.h[4]
+        smlal           v4.4s,   v21.4h,  v1.h[4]
+        smlal2          v5.4s,   v21.8h,  v1.h[4]
+        smlal           v2.4s,   v21.4h,  v1.h[5]
+        smlal2          v3.4s,   v21.8h,  v1.h[5]
+        smlal           v4.4s,   v22.4h,  v1.h[5]
+        smlal2          v5.4s,   v22.8h,  v1.h[5]
+        smlal           v2.4s,   v22.4h,  v1.h[6]
+        smlal2          v3.4s,   v22.8h,  v1.h[6]
+        smlal           v4.4s,   v23.4h,  v1.h[6]
+        smlal2          v5.4s,   v23.8h,  v1.h[6]
+        smlal           v2.4s,   v23.4h,  v1.h[7]
+        smlal2          v3.4s,   v23.8h,  v1.h[7]
+        smlal           v4.4s,   v24.4h,  v1.h[7]
+        smlal2          v5.4s,   v24.8h,  v1.h[7]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        sqxtun2         v2.8h,   v3.4s
+        sqxtun          v3.4h,   v4.4s
+        sqxtun2         v3.8h,   v5.4s
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        st1             {v2.8h}, [\dst], \d_strd
+        st1             {v3.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #3
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               168b
+0:
+        br              x15
+
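+        // Shared horizontal-filter helper: loads one row from each of the two
+        // interleaved source pointers, applies the 8-tap filter to 8 pixels
+        // per row (sliding the window one pixel at a time with ext and
+        // accumulating taps 1-7 with smlal/smlal2), shifts down to the
+        // intermediate precision and narrows the two filtered rows into
+        // v23/v24.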
+L(\type\()_8tap_filter_8):
+        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
+        ld1             {v6.8h, v7.8h},  [\src], \s_strd
+        smull           v25.4s,  v4.4h,   v0.h[0]
+        smull2          v26.4s,  v4.8h,   v0.h[0]
+        smull           v27.4s,  v6.4h,   v0.h[0]
+        smull2          v28.4s,  v6.8h,   v0.h[0]
+.irpc i, 1234567
+        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
+        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
+        smlal           v25.4s,  v23.4h,  v0.h[\i]
+        smlal2          v26.4s,  v23.8h,  v0.h[\i]
+        smlal           v27.4s,  v24.4h,  v0.h[\i]
+        smlal2          v28.4s,  v24.8h,  v0.h[\i]
+.endr
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v23.4h,  v25.4s
+        xtn2            v23.8h,  v26.4s
+        xtn             v24.4h,  v27.4s
+        xtn2            v24.8h,  v28.4s
+        ret
+
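+        // Offset table for the 8tap hv paths: each .hword is the distance
+        // from the table base back to the handler for one block width (1280
+        // down to 2). The dispatcher loads the entry selected by clz(w) and
+        // branches backwards to it, the same pattern as in the bilin dispatch
+        // below.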
+L(\type\()_8tap_hv_tbl):
+        .hword L(\type\()_8tap_hv_tbl) - 1280b
+        .hword L(\type\()_8tap_hv_tbl) -  640b
+        .hword L(\type\()_8tap_hv_tbl) -  320b
+        .hword L(\type\()_8tap_hv_tbl) -  160b
+        .hword L(\type\()_8tap_hv_tbl) -   80b
+        .hword L(\type\()_8tap_hv_tbl) -   40b
+        .hword L(\type\()_8tap_hv_tbl) -   20b
+        .hword 0
+endfunc
+
+
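+// Bilinear MC: out = ((16 - frac) * a + frac * b + rounding) >> shift, with
+// the horizontal weights in v0 (16 - mx) / v1 (mx) and the vertical weights
+// in v2 (16 - my) / v3 (my). Zero fractions fall through to the plain copy in
+// \type\()_neon; otherwise we dispatch to the h-only, v-only or hv path.
+// intermediate_bits = clz(bitdepth_max) - 18, i.e. 4 for 10 bpc, 2 for 12 bpc.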
+function \type\()_bilin_16bpc_neon, export=1
+.ifc \bdmax, w8
+        ldr             w8,  [sp]
+.endif
+        dup             v1.8h,   \mx
+        dup             v3.8h,   \my
+        mov             w10, #16
+        sub             w9,  w10, \mx
+        sub             w10, w10, \my
+        dup             v0.8h,   w9
+        dup             v2.8h,   w10
+.ifc \type, prep
+        uxtw            \d_strd, \w
+        lsl             \d_strd, \d_strd, #1
+.endif
+
+        clz             \bdmax,   \bdmax       // bitdepth_max
+        clz             w9,  \w
+        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
+        mov             w11, #4
+        sub             w9,  w9,  #24
+        sub             w11, w11, \bdmax  // 4 - intermediate_bits
+        add             w12, \bdmax, #4   // 4 + intermediate_bits
+        cbnz            \mx, L(\type\()_bilin_h)
+        cbnz            \my, L(\type\()_bilin_v)
+        b               \type\()_neon
+
+L(\type\()_bilin_h):
+        cbnz            \my, L(\type\()_bilin_hv)
+
+        adr             x10, L(\type\()_bilin_h_tbl)
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.ifc \type, put
+        dup             v30.8h,  \bdmax   // intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v30.8h,  v30.8h   // -intermediate_bits
+.endif
+        br              x10
+
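+        // The narrow horizontal paths filter two rows per iteration; the 2xN
+        // and 4xN variants pack both rows into a single vector with trn1 so
+        // one mul/mla pair covers them.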
+20:     // 2xN h
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        ld1             {v4.4h},  [\src], \s_strd
+        ld1             {v6.4h},  [\sr2], \s_strd
+        ext             v5.8b,   v4.8b,   v4.8b,   #2
+        ext             v7.8b,   v6.8b,   v6.8b,   #2
+        trn1            v4.2s,   v4.2s,   v6.2s
+        trn1            v5.2s,   v5.2s,   v7.2s
+        subs            \h,  \h,  #2
+        mul             v4.4h,   v4.4h,   v0.4h
+        mla             v4.4h,   v5.4h,   v1.4h
+        urshl           v4.4h,   v4.4h,   v31.4h
+        urshl           v4.4h,   v4.4h,   v30.4h
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+        b.gt            2b
+        ret
+.endif
+
+40:     // 4xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        ld1             {v4.8h}, [\src], \s_strd
+        ld1             {v6.8h}, [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v4.16b,  #2
+        ext             v7.16b,  v6.16b,  v6.16b,  #2
+        trn1            v4.2d,   v4.2d,   v6.2d
+        trn1            v5.2d,   v5.2d,   v7.2d
+        subs            \h,  \h,  #2
+        mul             v4.8h,   v4.8h,   v0.8h
+        mla             v4.8h,   v5.8h,   v1.8h
+        urshl           v4.8h,   v4.8h,   v31.8h
+.ifc \type, put
+        urshl           v4.8h,   v4.8h,   v30.8h
+.else
+        sub             v4.8h,   v4.8h,   v29.8h
+.endif
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.gt            4b
+        ret
+
+80:     // 8xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        ldr             h5,  [\src, #16]
+        ldr             h7,  [\sr2, #16]
+        ld1             {v4.8h}, [\src], \s_strd
+        ld1             {v6.8h}, [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v5.16b,  #2
+        ext             v7.16b,  v6.16b,  v7.16b,  #2
+        subs            \h,  \h,  #2
+        mul             v4.8h,   v4.8h,   v0.8h
+        mla             v4.8h,   v5.8h,   v1.8h
+        mul             v6.8h,   v6.8h,   v0.8h
+        mla             v6.8h,   v7.8h,   v1.8h
+        urshl           v4.8h,   v4.8h,   v31.8h
+        urshl           v6.8h,   v6.8h,   v31.8h
+.ifc \type, put
+        urshl           v4.8h,   v4.8h,   v30.8h
+        urshl           v6.8h,   v6.8h,   v30.8h
+.else
+        sub             v4.8h,   v4.8h,   v29.8h
+        sub             v6.8h,   v6.8h,   v29.8h
+.endif
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v6.8h}, [\ds2], \d_strd
+        b.gt            8b
+        ret
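+        // Wide horizontal path: walk each row pair 16 pixels at a time,
+        // keeping one vector of lookahead in v16/v21 so the one-pixel ext
+        // never reads past the loaded data; the strides are pre-adjusted on
+        // entry so the per-row epilogue steps straight to the next row pair.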
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+
+        sub             \s_strd,  \s_strd,  \w, uxtw #1
+        sub             \s_strd,  \s_strd,  #16
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, uxtw #1
+.endif
+161:
+        ld1             {v16.8h},  [\src], #16
+        ld1             {v21.8h},  [\sr2], #16
+        mov             \mx, \w
+
+16:
+        ld1             {v17.8h, v18.8h},  [\src], #32
+        ld1             {v22.8h, v23.8h},  [\sr2], #32
+        ext             v19.16b, v16.16b, v17.16b, #2
+        ext             v20.16b, v17.16b, v18.16b, #2
+        ext             v24.16b, v21.16b, v22.16b, #2
+        ext             v25.16b, v22.16b, v23.16b, #2
+        mul             v16.8h,  v16.8h,  v0.8h
+        mla             v16.8h,  v19.8h,  v1.8h
+        mul             v17.8h,  v17.8h,  v0.8h
+        mla             v17.8h,  v20.8h,  v1.8h
+        mul             v21.8h,  v21.8h,  v0.8h
+        mla             v21.8h,  v24.8h,  v1.8h
+        mul             v22.8h,  v22.8h,  v0.8h
+        mla             v22.8h,  v25.8h,  v1.8h
+        urshl           v16.8h,  v16.8h,  v31.8h
+        urshl           v17.8h,  v17.8h,  v31.8h
+        urshl           v21.8h,  v21.8h,  v31.8h
+        urshl           v22.8h,  v22.8h,  v31.8h
+        subs            \mx, \mx, #16
+.ifc \type, put
+        urshl           v16.8h,  v16.8h,  v30.8h
+        urshl           v17.8h,  v17.8h,  v30.8h
+        urshl           v21.8h,  v21.8h,  v30.8h
+        urshl           v22.8h,  v22.8h,  v30.8h
+.else
+        sub             v16.8h,  v16.8h,  v29.8h
+        sub             v17.8h,  v17.8h,  v29.8h
+        sub             v21.8h,  v21.8h,  v29.8h
+        sub             v22.8h,  v22.8h,  v29.8h
+.endif
+        st1             {v16.8h, v17.8h}, [\dst], #32
+        st1             {v21.8h, v22.8h}, [\ds2], #32
+        b.le            9f
+
+        mov             v16.16b, v18.16b
+        mov             v21.16b, v23.16b
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            161b
+        ret
+
+L(\type\()_bilin_h_tbl):
+        .hword L(\type\()_bilin_h_tbl) - 1280b
+        .hword L(\type\()_bilin_h_tbl) -  640b
+        .hword L(\type\()_bilin_h_tbl) -  320b
+        .hword L(\type\()_bilin_h_tbl) -  160b
+        .hword L(\type\()_bilin_h_tbl) -   80b
+        .hword L(\type\()_bilin_h_tbl) -   40b
+        .hword L(\type\()_bilin_h_tbl) -   20b
+        .hword 0
+
+
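+        // Vertical-only bilinear: interpolate between vertically adjacent
+        // rows. Each iteration loads the next row(s) and keeps the bottom row
+        // around as the next iteration's top row, so every input row is
+        // loaded exactly once.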
+L(\type\()_bilin_v):
+        cmp             \h,  #4
+        adr             x10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+.endif
+        ldrh            w9,  [x10, x9, lsl #1]
+.ifc \type, prep
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.endif
+        sub             x10, x10, w9, uxtw
+        br              x10
+
+20:     // 2xN v
+.ifc \type, put
+        cmp             \h,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        ld1             {v16.s}[0], [\src], \s_strd
+        b.gt            24f
+        ld1             {v17.s}[0], [\sr2], \s_strd
+        ld1             {v18.s}[0], [\src], \s_strd
+        trn1            v16.2s,  v16.2s,  v17.2s
+        trn1            v17.2s,  v17.2s,  v18.2s
+        mul             v4.4h,   v16.4h,  v2.4h
+        mla             v4.4h,   v17.4h,  v3.4h
+        urshr           v4.8h,   v4.8h,   #4
+        st1             {v4.s}[0], [\dst]
+        st1             {v4.s}[1], [\ds2]
+        ret
+24:     // 2x4, 2x8, ... v
+        ld1             {v17.s}[0], [\sr2], \s_strd
+        ld1             {v18.s}[0], [\src], \s_strd
+        ld1             {v19.s}[0], [\sr2], \s_strd
+        ld1             {v20.s}[0], [\src], \s_strd
+        trn1            v16.2s,  v16.2s,  v17.2s
+        trn1            v17.2s,  v17.2s,  v18.2s
+        trn1            v18.2s,  v18.2s,  v19.2s
+        trn1            v19.2s,  v19.2s,  v20.2s
+        trn1            v16.2d,  v16.2d,  v18.2d
+        trn1            v17.2d,  v17.2d,  v19.2d
+        mul             v4.8h,   v16.8h,  v2.8h
+        mla             v4.8h,   v17.8h,  v3.8h
+        subs            \h,  \h,  #4
+        urshr           v4.8h,   v4.8h,   #4
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+        st1             {v4.s}[2], [\dst], \d_strd
+        st1             {v4.s}[3], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v20.8b
+        b               24b
+0:
+        ret
+.endif
+
+40:     // 4xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.4h}, [\src], \s_strd
+4:
+        ld1             {v17.4h}, [\sr2], \s_strd
+        ld1             {v18.4h}, [\src], \s_strd
+        trn1            v16.2d,  v16.2d,  v17.2d
+        trn1            v17.2d,  v17.2d,  v18.2d
+        mul             v4.8h,   v16.8h,  v2.8h
+        mla             v4.8h,   v17.8h,  v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h
+        sub             v4.8h,   v4.8h,   v29.8h
+.endif
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        b               4b
+0:
+        ret
+
+80:     // 8xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.8h}, [\src], \s_strd
+8:
+        ld1             {v17.8h}, [\sr2], \s_strd
+        ld1             {v18.8h}, [\src], \s_strd
+        mul             v4.8h,   v16.8h,  v2.8h
+        mla             v4.8h,   v17.8h,  v3.8h
+        mul             v5.8h,   v17.8h,  v2.8h
+        mla             v5.8h,   v18.8h,  v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4
+        urshr           v5.8h,   v5.8h,   #4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h
+        urshl           v5.8h,   v5.8h,   v31.8h
+        sub             v4.8h,   v4.8h,   v29.8h
+        sub             v5.8h,   v5.8h,   v29.8h
+.endif
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v5.8h}, [\ds2], \d_strd
+        b.le            0f
+        mov             v16.16b, v18.16b
+        b               8b
+0:
+        ret
+
+160:    // 16xN, 32xN, ...
+320:
+640:
+1280:
+        mov             \my, \h
+1:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v16.8h, v17.8h}, [\src], \s_strd
+2:
+        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
+        ld1             {v20.8h, v21.8h}, [\src], \s_strd
+        mul             v4.8h,   v16.8h,  v2.8h
+        mla             v4.8h,   v18.8h,  v3.8h
+        mul             v5.8h,   v17.8h,  v2.8h
+        mla             v5.8h,   v19.8h,  v3.8h
+        mul             v6.8h,   v18.8h,  v2.8h
+        mla             v6.8h,   v20.8h,  v3.8h
+        mul             v7.8h,   v19.8h,  v2.8h
+        mla             v7.8h,   v21.8h,  v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4
+        urshr           v5.8h,   v5.8h,   #4
+        urshr           v6.8h,   v6.8h,   #4
+        urshr           v7.8h,   v7.8h,   #4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h
+        urshl           v5.8h,   v5.8h,   v31.8h
+        urshl           v6.8h,   v6.8h,   v31.8h
+        urshl           v7.8h,   v7.8h,   v31.8h
+        sub             v4.8h,   v4.8h,   v29.8h
+        sub             v5.8h,   v5.8h,   v29.8h
+        sub             v6.8h,   v6.8h,   v29.8h
+        sub             v7.8h,   v7.8h,   v29.8h
+.endif
+        st1             {v4.8h, v5.8h}, [\dst], \d_strd
+        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        b               2b
+9:
+        subs            \w,  \w,  #16
+        b.le            0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        msub            \src, \s_strd, \xmy, \src
+        msub            \dst, \d_strd, \xmy, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #32
+        add             \dst, \dst, #32
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_v_tbl):
+        .hword L(\type\()_bilin_v_tbl) - 1280b
+        .hword L(\type\()_bilin_v_tbl) -  640b
+        .hword L(\type\()_bilin_v_tbl) -  320b
+        .hword L(\type\()_bilin_v_tbl) -  160b
+        .hword L(\type\()_bilin_v_tbl) -   80b
+        .hword L(\type\()_bilin_v_tbl) -   40b
+        .hword L(\type\()_bilin_v_tbl) -   20b
+        .hword 0
+
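+        // Combined hv bilinear: the horizontal pass keeps intermediate_bits
+        // of extra precision (urshl by -(4-intermediate_bits)); the vertical
+        // pass widens to 32 bit and shifts by -(4+intermediate_bits) for put,
+        // or by a plain #4 plus a PREP_BIAS subtraction for prep. The last
+        // filtered row is carried over between iterations.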
+L(\type\()_bilin_hv):
+        adr             x10, L(\type\()_bilin_hv_tbl)
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.ifc \type, put
+        dup             v30.4s,  w12      // 4 + intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
+.endif
+        br              x10
+
+20:     // 2xN hv
+.ifc \type, put
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v20.4h},  [\src], \s_strd
+        ext             v21.8b,  v20.8b,  v20.8b,  #2
+        mul             v16.4h,  v20.4h,  v0.4h
+        mla             v16.4h,  v21.4h,  v1.4h
+        urshl           v16.4h,  v16.4h,  v31.4h
+
+2:
+        ld1             {v22.4h},  [\sr2], \s_strd
+        ld1             {v24.4h},  [\src], \s_strd
+        ext             v23.8b,  v22.8b,  v22.8b,  #2
+        ext             v25.8b,  v24.8b,  v24.8b,  #2
+        trn1            v22.2s,  v22.2s,  v24.2s
+        trn1            v23.2s,  v23.2s,  v25.2s
+        mul             v17.4h,  v22.4h,  v0.4h
+        mla             v17.4h,  v23.4h,  v1.4h
+        urshl           v17.4h,  v17.4h,  v31.4h
+
+        trn1            v16.2s,  v16.2s,  v17.2s
+
+        umull           v4.4s,   v16.4h,  v2.4h
+        umlal           v4.4s,   v17.4h,  v3.4h
+        urshl           v4.4s,   v4.4s,   v30.4s
+        xtn             v4.4h,   v4.4s
+        subs            \h,  \h,  #2
+        st1             {v4.s}[0], [\dst], \d_strd
+        st1             {v4.s}[1], [\ds2], \d_strd
+        b.le            0f
+        trn2            v16.2s,  v17.2s,  v17.2s
+        b               2b
+0:
+        ret
+.endif
+
+40:     // 4xN hv
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v20.8h},  [\src], \s_strd
+        ext             v21.16b, v20.16b, v20.16b, #2
+        mul             v16.4h,  v20.4h,  v0.4h
+        mla             v16.4h,  v21.4h,  v1.4h
+        urshl           v16.4h,  v16.4h,  v31.4h
+
+4:
+        ld1             {v22.8h},  [\sr2], \s_strd
+        ld1             {v24.8h},  [\src], \s_strd
+        ext             v23.16b, v22.16b, v22.16b, #2
+        ext             v25.16b, v24.16b, v24.16b, #2
+        trn1            v22.2d,  v22.2d,  v24.2d
+        trn1            v23.2d,  v23.2d,  v25.2d
+        mul             v17.8h,  v22.8h,  v0.8h
+        mla             v17.8h,  v23.8h,  v1.8h
+        urshl           v17.8h,  v17.8h,  v31.8h
+
+        trn1            v16.2d,  v16.2d,  v17.2d
+
+        umull           v4.4s,   v16.4h,  v2.4h
+        umlal           v4.4s,   v17.4h,  v3.4h
+        umull2          v5.4s,   v16.8h,  v2.8h
+        umlal2          v5.4s,   v17.8h,  v3.8h
+.ifc \type, put
+        urshl           v4.4s,   v4.4s,   v30.4s
+        urshl           v5.4s,   v5.4s,   v30.4s
+        xtn             v4.4h,   v4.4s
+        xtn2            v4.8h,   v5.4s
+.else
+        rshrn           v4.4h,   v4.4s,   #4
+        rshrn2          v4.8h,   v5.4s,   #4
+        sub             v4.8h,   v4.8h,   v29.8h
+.endif
+        subs            \h,  \h,  #2
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.le            0f
+        trn2            v16.2d,  v17.2d,  v17.2d
+        b               4b
+0:
+        ret
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ldr             h21, [\src, #16]
+        ld1             {v20.8h},  [\src], \s_strd
+        ext             v21.16b, v20.16b, v21.16b, #2
+        mul             v16.8h,  v20.8h,  v0.8h
+        mla             v16.8h,  v21.8h,  v1.8h
+        urshl           v16.8h,  v16.8h,  v31.8h
+
+2:
+        ldr             h23, [\sr2, #16]
+        ld1             {v22.8h},  [\sr2], \s_strd
+        ldr             h25, [\src, #16]
+        ld1             {v24.8h},  [\src], \s_strd
+        ext             v23.16b, v22.16b, v23.16b, #2
+        ext             v25.16b, v24.16b, v25.16b, #2
+        mul             v17.8h,  v22.8h,  v0.8h
+        mla             v17.8h,  v23.8h,  v1.8h
+        mul             v18.8h,  v24.8h,  v0.8h
+        mla             v18.8h,  v25.8h,  v1.8h
+        urshl           v17.8h,  v17.8h,  v31.8h
+        urshl           v18.8h,  v18.8h,  v31.8h
+
+        umull           v4.4s,   v16.4h,  v2.4h
+        umlal           v4.4s,   v17.4h,  v3.4h
+        umull2          v5.4s,   v16.8h,  v2.8h
+        umlal2          v5.4s,   v17.8h,  v3.8h
+        umull           v6.4s,   v17.4h,  v2.4h
+        umlal           v6.4s,   v18.4h,  v3.4h
+        umull2          v7.4s,   v17.8h,  v2.8h
+        umlal2          v7.4s,   v18.8h,  v3.8h
+.ifc \type, put
+        urshl           v4.4s,   v4.4s,   v30.4s
+        urshl           v5.4s,   v5.4s,   v30.4s
+        urshl           v6.4s,   v6.4s,   v30.4s
+        urshl           v7.4s,   v7.4s,   v30.4s
+        xtn             v4.4h,   v4.4s
+        xtn2            v4.8h,   v5.4s
+        xtn             v5.4h,   v6.4s
+        xtn2            v5.8h,   v7.4s
+.else
+        rshrn           v4.4h,   v4.4s,   #4
+        rshrn2          v4.8h,   v5.4s,   #4
+        rshrn           v5.4h,   v6.4s,   #4
+        rshrn2          v5.8h,   v7.4s,   #4
+        sub             v4.8h,   v4.8h,   v29.8h
+        sub             v5.8h,   v5.8h,   v29.8h
+.endif
+        subs            \h,  \h,  #2
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v5.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_hv_tbl):
+        .hword L(\type\()_bilin_hv_tbl) - 1280b
+        .hword L(\type\()_bilin_hv_tbl) -  640b
+        .hword L(\type\()_bilin_hv_tbl) -  320b
+        .hword L(\type\()_bilin_hv_tbl) -  160b
+        .hword L(\type\()_bilin_hv_tbl) -   80b
+        .hword L(\type\()_bilin_hv_tbl) -   40b
+        .hword L(\type\()_bilin_hv_tbl) -   20b
+        .hword 0
+endfunc
+.endm
+
+filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
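+// Load one 8-tap warp filter (8 int8 coefficients) from the mc_warp_filter
+// table, selected by the integer part of the 1/1024th-pel position (the
+// caller adds the +512 rounding bias first), then advance the position by
+// one step.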
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10
+        ldr             \dst, [x11, w13, sxtw #3]
+        add             \src, \src, \inc
+.endm
+
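+// Horizontally filter one source row for an 8x8 warp block: each of the
+// eight output pixels gets its own 8-tap filter (the x position advances by
+// abcd[0] per pixel), and the eight 32-bit dot products are reduced pairwise
+// with addp into v16/v17. Uses v8-v11, which the caller has saved.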
+function warp_filter_horz_neon
+        add             w12, w5,  #512
+
+        ld1             {v16.8h, v17.8h}, [x2], x3
+
+        load_filter_row d0, w12, w7
+        load_filter_row d1, w12, w7
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h,   v0.8b
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h,   v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h,   v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h,   v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h,   v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h,   v5.8b
+        ext             v18.16b, v16.16b, v17.16b, #2*1
+        smull           v8.4s,   v16.4h,  v0.4h
+        smull2          v9.4s,   v16.8h,  v0.8h
+        sxtl            v6.8h,   v6.8b
+        ext             v19.16b, v16.16b, v17.16b, #2*2
+        smull           v10.4s,  v18.4h,  v1.4h
+        smull2          v11.4s,  v18.8h,  v1.8h
+        sxtl            v7.8h,   v7.8b
+        ext             v20.16b, v16.16b, v17.16b, #2*3
+        smull           v0.4s,   v19.4h,  v2.4h
+        smull2          v1.4s,   v19.8h,  v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4
+        addp            v8.4s,   v8.4s,   v9.4s
+        smull           v2.4s,   v20.4h,  v3.4h
+        smull2          v3.4s,   v20.8h,  v3.8h
+        ext             v22.16b, v16.16b, v17.16b, #2*5
+        addp            v9.4s,   v10.4s,  v11.4s
+        smull           v10.4s,  v21.4h,  v4.4h
+        smull2          v11.4s,  v21.8h,  v4.8h
+        ext             v23.16b, v16.16b, v17.16b, #2*6
+        addp            v0.4s,   v0.4s,   v1.4s
+        smull           v18.4s,  v22.4h,  v5.4h
+        smull2          v19.4s,  v22.8h,  v5.8h
+        ext             v16.16b, v16.16b, v17.16b, #2*7
+        addp            v1.4s,   v2.4s,   v3.4s
+        addp            v2.4s,   v10.4s,  v11.4s
+        smull           v20.4s,  v23.4h,  v6.4h
+        smull2          v21.4s,  v23.8h,  v6.8h
+        addp            v3.4s,   v18.4s,  v19.4s
+        smull           v22.4s,  v16.4h,  v7.4h
+        smull2          v23.4s,  v16.8h,  v7.8h
+        addp            v4.4s,   v20.4s,  v21.4s
+        addp            v5.4s,   v22.4s,  v23.4s
+
+        addp            v8.4s,   v8.4s,   v9.4s
+        addp            v0.4s,   v0.4s,   v1.4s
+        addp            v2.4s,   v2.4s,   v3.4s
+        addp            v4.4s,   v4.4s,   v5.4s
+
+        addp            v16.4s,  v8.4s,   v0.4s
+        addp            v17.4s,  v2.4s,   v4.4s
+
+        add             w5,  w5,  w8
+
+        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
+        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
+
+        ret
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my,
+//         const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+
+.ifb \t
+        dup             v15.8h,  w7        // bitdepth_max
+.else
+        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        clz             w7,  w7            // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
+.ifb \t
+        neg             w8,  w8            // -(7 + intermediate_bits)
+.endif
+        dup             v14.4s,  w7        // -(7 - intermediate_bits)
+.ifb \t
+        dup             v13.4s,  w8        // -(7 + intermediate_bits)
+.endif
+
+        ldr             x4,  [x4]
+        sbfx            x7,  x4, #0,  #16
+        sbfx            x8,  x4, #16, #16
+        sbfx            x9,  x4, #32, #16
+        sbfx            x4,  x4, #48, #16
+        mov             w10, #8
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        sub             x2,  x2,  #6
+        movrel          x11, X(mc_warp_filter), 64*8
+        mov             x15, x30
+.ifnb \t
+        lsl             x1,  x1,  #1
+.endif
+
+        bl              warp_filter_horz_neon
+        xtn             v24.4h,  v16.4s
+        xtn2            v24.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v25.4h,  v16.4s
+        xtn2            v25.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v26.4h,  v16.4s
+        xtn2            v26.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v27.4h,  v16.4s
+        xtn2            v27.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v28.4h,  v16.4s
+        xtn2            v28.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v29.4h,  v16.4s
+        xtn2            v29.8h,  v17.4s
+        bl              warp_filter_horz_neon
+        xtn             v30.4h,  v16.4s
+        xtn2            v30.8h,  v17.4s
+
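+        // Vertical pass: v24-v30 hold the last seven filtered rows. Each
+        // column has its own 8-tap vertical filter; the 8x8 transpose
+        // regroups them so v0-v7 hold taps 0-7 across all eight columns,
+        // letting plain smull/smlal do the per-column filtering before the
+        // row registers rotate down.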
+1:
+        add             w14, w6,  #512
+        bl              warp_filter_horz_neon
+        xtn             v31.4h,  v16.4s
+        xtn2            v31.8h,  v17.4s
+
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
+        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        sxtl            v2.8h,   v2.8b
+        sxtl            v3.8h,   v3.8b
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s,  v24.4h,  v0.4h
+        smlal           v16.4s,  v25.4h,  v1.4h
+        smlal           v16.4s,  v26.4h,  v2.4h
+        smlal           v16.4s,  v27.4h,  v3.4h
+        smlal           v16.4s,  v28.4h,  v4.4h
+        smlal           v16.4s,  v29.4h,  v5.4h
+        smlal           v16.4s,  v30.4h,  v6.4h
+        smlal           v16.4s,  v31.4h,  v7.4h
+        smull2          v17.4s,  v24.8h,  v0.8h
+        smlal2          v17.4s,  v25.8h,  v1.8h
+        smlal2          v17.4s,  v26.8h,  v2.8h
+        smlal2          v17.4s,  v27.8h,  v3.8h
+        smlal2          v17.4s,  v28.8h,  v4.8h
+        smlal2          v17.4s,  v29.8h,  v5.8h
+        smlal2          v17.4s,  v30.8h,  v6.8h
+        smlal2          v17.4s,  v31.8h,  v7.8h
+
+        mov             v24.16b, v25.16b
+        mov             v25.16b, v26.16b
+.ifb \t
+        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
+        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
+.else
+        rshrn           v16.4h,  v16.4s,  #7
+        rshrn2          v16.8h,  v17.4s,  #7
+.endif
+        mov             v26.16b, v27.16b
+.ifb \t
+        sqxtun          v16.4h,  v16.4s
+        sqxtun2         v16.8h,  v17.4s
+.else
+        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
+.endif
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1
+        st1             {v16.8h}, [x0], x1
+
+        add             w6,  w6,  w4
+        b.gt            1b
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+
+        br              x15
+endfunc
+.endm
+
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+        ldp             x8,  x9,  [sp]
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13, lsl #1  // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+
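+// Copy the center rows, replicating the leftmost/rightmost source pixel
+// across the left/right extension. Instantiated four times below so the
+// per-row loop carries no edge-case branches.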
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+        ld1r            {v0.8h}, [x8]
+        mov             x12, x6                // out = dst
+        mov             x3,  x4
+        mov             v1.16b,  v0.16b
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+        mov             x13, x8
+        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
+        mov             x3,  x2
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+        subs            x3,  x3,  #32
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2, lsl #1   // in + center_w
+        sub             x3,  x3,  #2           // in + center_w - 1
+        add             x12, x6,  x4, lsl #1   // dst + left_ext
+        ld1r            {v0.8h}, [x3]
+        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
+        mov             x3,  x11
+        mov             v1.16b,  v0.16b
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7
+        add             x8,  x8,  x9
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f
+        // need_left
+        cbz             x11, 3f
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride
+        mov             x4,  x0
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+        mov             x3,  x10
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
+        mov             x3,  x5
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/src/arm/64/msac.S b/src/arm/64/msac.S
new file mode 100644
index 0000000..3a6cf90
--- /dev/null
+++ b/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
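+// coeffs holds multiples of EC_MIN_PROB (4 in dav1d) in descending order; the
+// indexing below is set up so lane i reads EC_MIN_PROB * (n_symbols - i).
+// bits holds one distinct bit per 16-bit lane, used to turn a lane-wise
+// compare mask into a scalar bitmask via addv.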
+const coeffs
+        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
+endconst
+
+const bits
+        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
+        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
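+// The *_n macros emit one vector instruction for n <= 8 CDF entries and two
+// for n == 16, letting a single decode_update body serve the symbol_adapt4,
+// adapt8 and adapt16 entry points.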
+.macro ld1_n d0, d1, src, sz, n
+.if \n <= 8
+        ld1             {\d0\sz},  [\src]
+.else
+        ld1             {\d0\sz, \d1\sz},  [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n
+.if \n <= 8
+        st1             {\s0\sz},  [\dst]
+.else
+        st1             {\s0\sz, \s1\sz},  [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n
+        ushr            \d0\sz,  \s0\sz,  \shift
+.if \n == 16
+        ushr            \d1\sz,  \s1\sz,  \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n
+        add             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        add             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
+        sub             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        sub             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n
+        and             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        and             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
+        cmhs            \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        cmhs            \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
+        urhadd          \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        urhadd          \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
+        sshl            \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        sshl            \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro str_n            idx0, idx1, dstreg, dstoff, n
+        str             \idx0,  [\dstreg, \dstoff]
+.if \n == 16
+        str             \idx1,  [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+//                                               size_t n_symbols);
+
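+// Strategy: compute every candidate v value at once, compare the top 16 bits
+// of dif against all of them in parallel, condense the per-lane results into
+// a scalar bitmask via the bits table and addv, then rbit+clz that mask to
+// find the first lane with c >= v, which is the decoded symbol.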
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update sz, szb, n
+        sub             sp,  sp,  #48
+        add             x8,  x0,  #RNG
+        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
+        ld1r            {v4\sz},  [x8]                            // rng
+        movrel          x9,  coeffs, 30
+        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
+        sub             x9,  x9,  x2, lsl #1
+        mvni            v30\sz, #0x3f                             // 0xffc0
+        and             v7\szb, v4\szb, v31\szb                   // rng & 0x7f00
+        str             h4,  [sp, #14]                            // store original u = s->rng
+        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
+
+        ld1_n           v4,  v5,  x9,  \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
+        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             x8,  x0,  #DIF + 6
+
+        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+        ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
+        movrel          x8,  bits
+        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
+
+        ld1_n           v16, v17, x8,  .8h, \n
+
+        cmhs_n          v2,  v3,  v6,  v6,  v4,  v5,  .8h,  \n    // c >= v
+
+        and_n           v6,  v7,  v2,  v3,  v16, v17, .16b, \n    // One bit per halfword set in the mask
+.if \n == 16
+        add             v6.8h,  v6.8h,  v7.8h
+.endif
+        addv            h6,  v6.8h                                // Aggregate mask bits
+        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
+        umov            w3,  v6.h[0]
+        rbit            w3,  w3
+        clz             w15, w3                                   // ret
+
+        cbz             w4,  L(renorm)
+        // update_cdf
+        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
+        movi            v5\szb, #0xff
+.if \n == 16
+        mov             w4,  #-5
+.else
+        mvn             w14, w2
+        mov             w4,  #-4
+        cmn             w14, #3                                   // set C if n_symbols <= 2
+.endif
+        urhadd_n        v4,  v5,  v5,  v5,  v2,  v3,  \sz, \n     // i >= val ? -1 : 32768
+.if \n == 16
+        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
+.else
+        lsr             w14, w3,  #4                              // count >> 4
+        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+        sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6\sz,    w4                              // -rate
+
+        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
+        sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
+        sshl_n          v4,  v5,  v4,  v5,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
+        add             w3,  w3,  #1                              // count + (count < 32)
+        add_n           v0,  v1,  v0,  v1,  v4,  v5,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
+        st1_n           v0,  v1,  x1,  \sz, \n
+        strh            w3,  [x1, x2, lsl #1]
+.endm
+
+        decode_update   .4h, .8b, 4
+
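+        // Renormalization, shared by all decoders: rng = u - v, and both rng
+        // and dif are shifted left by d = clz(rng) ^ 16 so that rng stays in
+        // [0x8000, 0xffff]. dif is inverted around the shift so the vacated
+        // low bits fill with ones. Once cnt goes negative, refill pulls in
+        // the next 8 bytes from the buffer (byte by byte in refill_eob near
+        // the end of the buffer).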
+L(renorm):
+        add             x8,  sp,  #16
+        add             x8,  x8,  w15, uxtw #1
+        ldrh            w3,  [x8]              // v
+        ldurh           w4,  [x8, #-2]         // u
+        ldr             w6,  [x0, #CNT]
+        ldr             x7,  [x0, #DIF]
+        sub             w4,  w4,  w3           // rng = u - v
+        clz             w5,  w4                // clz(rng)
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        mvn             x7,  x7                // ~dif
+        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+L(renorm2):
+        lsl             w4,  w4,  w5           // rng << d
+        subs            w6,  w6,  w5           // cnt -= d
+        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        mvn             x7,  x7                // ~dif
+        b.hs            9f
+
+        // refill
+        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]              // next_bits
+        add             w8,  w6,  #23          // shift_bits = cnt + 23
+        add             w6,  w6,  #16          // cnt += 16
+        rev             x3,  x3                // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24          // shift_bits &= 24
+        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
+        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6           // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5           // cnt = 40 - c
+
+9:
+        str             w6,  [x0, #CNT]
+        str             x7,  [x0, #DIF]
+
+        mov             w0,  w15
+        add             sp,  sp,  #48
+        ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+        decode_update   .8h, .16b, 8
+        b               L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+        decode_update   .8h, .16b, 16
+        b               L(renorm)
+endfunc
+
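+// hi_tok: decode successive symbols from the same 3-symbol CDF in one tight
+// loop, with the adapt4-style decode, CDF update and renormalization all
+// inlined to avoid per-symbol call overhead; w13 accumulates the running
+// token and the adds/b.cc pair implements the exit condition noted at the
+// bottom.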
+function msac_decode_hi_tok_neon, export=1
+        ld1             {v0.4h},  [x1]            // cdf
+        add             x16, x0,  #RNG
+        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
+        movrel          x17, coeffs, 30-2*3
+        mvni            v30.4h, #0x3f             // 0xffc0
+        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
+        ld1r            {v3.4h},  [x16]           // rng
+        movrel          x16, bits
+        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
+        add             x17, x0,  #DIF + 6
+        ld1             {v16.8h}, [x16]
+        mov             w13, #-24
+        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
+        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
+        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
+        sub             sp,  sp,  #48
+        ldr             w6,  [x0, #CNT]
+        ldr             x7,  [x0, #DIF]
+1:
+        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
+        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+        str             h3,  [sp, #14]            // store original u = s->rng
+        cmhs            v2.8h,   v1.8h,   v4.8h   // c >= v
+        str             q4,  [sp, #16]            // store v values to allow indexed access
+        and             v6.16b,  v2.16b,  v16.16b // One bit per halfword set in the mask
+        addv            h6,  v6.8h                // Aggregate mask bits
+        umov            w3,  v6.h[0]
+        add             w13, w13, #5
+        rbit            w3,  w3
+        add             x8,  sp,  #16
+        clz             w15, w3                   // ret
+
+        cbz             w10, 2f
+        // update_cdf
+        movi            v5.8b, #0xff
+        mov             w4,  #-5
+        urhadd          v4.4h,   v5.4h,   v2.4h   // i >= val ? -1 : 32768
+        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
+        sub             v4.4h,   v4.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6.4h,    w4              // -rate
+
+        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
+        sub             v0.4h,   v0.4h,   v2.4h   // cdf + (i >= val ? 1 : 0)
+        sshl            v4.4h,   v4.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
+        add             w9,  w9,  #1              // count + (count < 32)
+        add             v0.4h,   v0.4h,   v4.4h   // cdf + (32768 - cdf[i]) >> rate
+        st1             {v0.4h},  [x1]
+        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
+        strh            w9,  [x1, #6]
+
+2:
+        add             x8,  x8,  w15, uxtw #1
+        ldrh            w3,  [x8]              // v
+        ldurh           w4,  [x8, #-2]         // u
+        sub             w4,  w4,  w3           // rng = u - v
+        clz             w5,  w4                // clz(rng)
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        mvn             x7,  x7                // ~dif
+        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+        lsl             w4,  w4,  w5           // rng << d
+        subs            w6,  w6,  w5           // cnt -= d
+        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        dup             v3.4h,   w4
+        mvn             x7,  x7                // ~dif
+        b.hs            9f
+
+        // refill
+        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]              // next_bits
+        add             w8,  w6,  #23          // shift_bits = cnt + 23
+        add             w6,  w6,  #16          // cnt += 16
+        rev             x3,  x3                // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24          // shift_bits &= 24
+        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
+        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6           // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5           // cnt = 40 - c
+
+9:
+        lsl             w15, w15, #1
+        sub             w15, w15, #5
+        lsr             x12, x7,  #48
+        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
+        dup             v1.8h,   w12
+        b.cc            1b                     // loop if !carry
+        add             w13, w13, #30
+        str             w6,  [x0, #CNT]
+        add             sp,  sp,  #48
+        str             x7,  [x0, #DIF]
+        lsr             w0,  w13, #1
+        ret
+endfunc
+
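+// Decode one bit with probability 1/2: v = ((rng & 0xff00) + 8) >> 1, the
+// returned bit is set when dif < (v << 48), and the state update falls into
+// the shared L(renorm2). The "sub sp" here pairs with the "add sp, sp, #48"
+// on the common exit path.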
+function msac_decode_bool_equi_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]   // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        bic             w4,  w5,  #0xff        // r &= 0xff00
+        add             w4,  w4,  #8
+        subs            x8,  x7,  x4, lsl #47  // dif - vw
+        lsr             w4,  w4,  #1           // v
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
+
+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]   // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        lsr             w4,  w5,  #8           // r >> 8
+        bic             w1,  w1,  #0x3f        // f &= ~63
+        mul             w4,  w4,  w1
+        lsr             w4,  w4,  #7
+        add             w4,  w4,  #4           // v
+        subs            x8,  x7,  x4, lsl #48  // dif - vw
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
+
+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+        ldr             w9,  [x1]              // cdf[0-1]
+        ldp             w5,  w6,  [x0, #RNG]   // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        lsr             w4,  w5,  #8           // r >> 8
+        and             w2,  w9,  #0xffc0      // f &= ~63
+        mul             w4,  w4,  w2
+        lsr             w4,  w4,  #7
+        add             w4,  w4,  #4           // v
+        subs            x8,  x7,  x4, lsl #48  // dif - vw
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
+
+        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
+
+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+
+        cbz             w10, L(renorm2)
+
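+        // C model of the branchless CDF update below (a sketch; >> is an
+        // arithmetic shift and count saturates at 32):
+        //     rate    = 4 + (count >> 4);
+        //     cdf[0] -= bit;
+        //     cdf[0] -= (cdf[0] - (bit << 15)) >> rate;
+        //     cdf[1]  = count + (count < 32);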
+        lsr             w2,  w9,  #16          // count = cdf[1]
+        and             w9,  w9,  #0xffff      // cdf[0]
+
+        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
+        lsr             w2,  w2,  #4           // count >> 4
+        add             w10, w3,  #1           // count + (count < 32)
+        add             w2,  w2,  #4           // rate = (count >> 4) | 4
+
+        sub             w9,  w9,  w15          // cdf[0] -= bit
+        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
+        sub             w9,  w9,  w11          // cdf[0]
+
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+
+        b               L(renorm2)
+endfunc
diff --git a/src/arm/64/util.S b/src/arm/64/util.S
new file mode 100644 (file)
index 0000000..fc0e0d0
--- /dev/null
+++ b/src/arm/64/util.S
@@ -0,0 +1,197 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_64_UTIL_S
+#define DAV1D_SRC_ARM_64_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro  movrel rd, val, offset=0
+#if defined(__APPLE__)
+  .if \offset < 0
+        adrp            \rd, \val@PAGE
+        add             \rd, \rd, \val@PAGEOFF
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)@PAGE
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
+  .endif
+#elif defined(PIC) && defined(_WIN32)
+  .if \offset < 0
+        adrp            \rd, \val
+        add             \rd, \rd, :lo12:\val
+        sub             \rd, \rd, -(\offset)
+  .else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+  .endif
+#elif defined(PIC)
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+#else
+        ldr             \rd, =\val+\offset
+#endif
+.endm
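+
+// Usage sketch for movrel (illustrative symbol name and offset, not taken
+// from a real call site):
+//     movrel          x3, some_table      // x3 = &some_table
+//     movrel          x3, some_table, 16  // x3 = &some_table + 16, with the
+//                                         // offset folded into the relocation
+//                                         // where the platform allows it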
+
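+// Transpose an 8x8 byte matrix held one row per 64-bit register in r0-r7,
+// using t8/t9 as scratch: three trn1/trn2 stages interleave 1-, 2- and
+// 4-element blocks, leaving the transposed rows back in r0-r7 in order.
+// The transpose macros below follow the same pattern at other element sizes.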
+.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+        trn1            \t8\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \t9\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
+        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
+        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
+        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+
+        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
+        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
+        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
+        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
+        trn1            \r5\().4h,  \t9\().4h,  \r3\().4h
+        trn2            \t9\().4h,  \t9\().4h,  \r3\().4h
+        trn1            \r3\().4h,  \t8\().4h,  \r1\().4h
+        trn2            \t8\().4h,  \t8\().4h,  \r1\().4h
+
+        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
+        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r6\().2s,  \t8\().2s,  \r2\().2s
+        trn1            \r2\().2s,  \t8\().2s,  \r2\().2s
+        trn1            \r3\().2s,  \t9\().2s,  \r7\().2s
+        trn2            \r7\().2s,  \t9\().2s,  \r7\().2s
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
+        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
+        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
+        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+
+        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
+        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
+        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
+        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
+        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
+        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
+        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s
+
+        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
+        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
+        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \r6\().2d,  \t8\().2d,  \r2\().2d
+        trn1            \r2\().2d,  \t8\().2d,  \r2\().2d
+        trn1            \r3\().2d,  \t9\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
+.endm
+
+.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+        trn1            \t8\().16b, \r0\().16b, \r1\().16b
+        trn2            \t9\().16b, \r0\().16b, \r1\().16b
+        trn1            \r1\().16b, \r2\().16b, \r3\().16b
+        trn2            \r3\().16b, \r2\().16b, \r3\().16b
+        trn1            \r0\().16b, \r4\().16b, \r5\().16b
+        trn2            \r5\().16b, \r4\().16b, \r5\().16b
+        trn1            \r2\().16b, \r6\().16b, \r7\().16b
+        trn2            \r7\().16b, \r6\().16b, \r7\().16b
+
+        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
+        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
+        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
+        trn1            \r5\().8h,  \t9\().8h,  \r3\().8h
+        trn2            \t9\().8h,  \t9\().8h,  \r3\().8h
+        trn1            \r3\().8h,  \t8\().8h,  \r1\().8h
+        trn2            \t8\().8h,  \t8\().8h,  \r1\().8h
+
+        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
+        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
+        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
+        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
+        trn2            \r6\().4s,  \t8\().4s,  \r2\().4s
+        trn1            \r2\().4s,  \t8\().4s,  \r2\().4s
+        trn1            \r3\().4s,  \t9\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \t9\().4s,  \r7\().4s
+.endm
+
+.macro  transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().16b, \r0\().16b, \r1\().16b
+        trn2            \t5\().16b, \r0\().16b, \r1\().16b
+        trn1            \t6\().16b, \r2\().16b, \r3\().16b
+        trn2            \t7\().16b, \r2\().16b, \r3\().16b
+
+        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
+        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
+        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
+        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
+.endm
+
+.macro  transpose_4x4h  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().4h,  \r0\().4h,  \r1\().4h
+        trn2            \t5\().4h,  \r0\().4h,  \r1\().4h
+        trn1            \t6\().4h,  \r2\().4h,  \r3\().4h
+        trn2            \t7\().4h,  \r2\().4h,  \r3\().4h
+
+        trn1            \r0\().2s,  \t4\().2s,  \t6\().2s
+        trn2            \r2\().2s,  \t4\().2s,  \t6\().2s
+        trn1            \r1\().2s,  \t5\().2s,  \t7\().2s
+        trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
+.endm
+
+.macro  transpose_4x4s  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
+        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
+        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s
+
+        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
+        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
+        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
+        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
+.endm
+
+.macro  transpose_4x8h  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
+#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/src/arm/asm.S b/src/arm/asm.S
new file mode 100644 (file)
index 0000000..1cd0955
--- /dev/null
+++ b/src/arm/asm.S
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_ASM_S
+#define DAV1D_SRC_ARM_ASM_S
+
+#include "config.h"
+
+#if ARCH_ARM
+        .syntax unified
+#ifdef __ELF__
+        .arch armv7-a
+        .fpu neon
+        .eabi_attribute 10, 0           // suppress Tag_FP_arch
+        .eabi_attribute 12, 0           // suppress Tag_Advanced_SIMD_arch
+        .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+#endif
+
+#ifdef _WIN32
+#define CONFIG_THUMB 1
+#else
+#define CONFIG_THUMB 0
+#endif
+
+#if CONFIG_THUMB
+        .thumb
+#define A @
+#define T
+#else
+#define A
+#define T @
+#endif
+#endif
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, align=2
+    .macro endfunc
+#ifdef __ELF__
+        .size   \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+        .endfunc
+#endif
+        .purgem endfunc
+    .endm
+        .text
+        .align \align
+    .if \export
+        .global EXTERN\name
+#ifdef __ELF__
+        .type   EXTERN\name, %function
+        .hidden EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+        .func   EXTERN\name
+#endif
+EXTERN\name:
+    .else
+#ifdef __ELF__
+        .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+        .func \name
+#endif
+    .endif
+\name:
+.endm
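+
+// Usage sketch (hypothetical function name): an exported function picks up
+// the EXTERN (PRIVATE_PREFIX) symbol prefix and, on ELF, hidden visibility:
+//     function checkasm_stub_neon, export=1
+//             ret
+//     endfunc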
+
+.macro  const   name, export=0, align=2
+    .macro endconst
+#ifdef __ELF__
+        .size   \name, . - \name
+#endif
+        .purgem endconst
+    .endm
+#if defined(_WIN32)
+        .section        .rdata
+#elif !defined(__MACH__)
+        .section        .rodata
+#else
+        .const_data
+#endif
+        .align          \align
+    .if \export
+        .global EXTERN\name
+#ifdef __ELF__
+        .hidden EXTERN\name
+#endif
+EXTERN\name:
+    .endif
+\name:
+.endm
+
+#ifdef __APPLE__
+#define L(x) L ## x
+#else
+#define L(x) .L ## x
+#endif
+
+#define X(x) CONCAT(EXTERN, x)
+
+#if ARCH_AARCH64
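+// x18 is reserved as a platform register on some AArch64 ABIs (e.g. Darwin
+// and Windows), so redefine it to a name that fails to assemble if used.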
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+#endif
+
+#endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/src/arm/cdef_init_tmpl.c b/src/arm/cdef_init_tmpl.c
new file mode 100644 (file)
index 0000000..bbd6586
--- /dev/null
+++ b/src/arm/cdef_init_tmpl.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+                                   ptrdiff_t src_stride, const pixel (*left)[2],
+                                   const pixel *const top, int h,
+                                   enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+                                   ptrdiff_t src_stride, const pixel (*left)[2],
+                                   const pixel *const top, int h,
+                                   enum CdefEdgeFlags edges);
+
+// edges is passed to this function to allow it to switch to a more
+// optimized version in the fully-edged case. edges is typed size_t to
+// avoid ABI differences when more than one argument is passed on the stack.
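+// For example (illustrative): with CDEF_HAVE_LEFT, CDEF_HAVE_RIGHT,
+// CDEF_HAVE_TOP and CDEF_HAVE_BOTTOM all set in edges, the asm can take a
+// fast path with no boundary special-casing.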
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+                                  const uint16_t *tmp, int pri_strength,
+                                  int sec_strength, int dir, int damping, int h,
+                                  size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+                                  const uint16_t *tmp, int pri_strength,
+                                  int sec_strength, int dir, int damping, int h,
+                                  size_t edges HIGHBD_DECL_SUFFIX);
+
+#define DEFINE_FILTER(w, h, tmp_stride)                                      \
+static void                                                                  \
+cdef_filter_##w##x##h##_neon(pixel *dst,                                     \
+                             const ptrdiff_t stride,                         \
+                             const pixel (*left)[2], const pixel *const top, \
+                             const int pri_strength, const int sec_strength, \
+                             const int dir, const int damping,               \
+                             const enum CdefEdgeFlags edges                  \
+                             HIGHBD_DECL_SUFFIX)                             \
+{                                                                            \
+    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,);                   \
+    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8;                            \
+    BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges);  \
+    BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength,           \
+                                   sec_strength, dir, damping, h, edges      \
+                                   HIGHBD_TAIL_SUFFIX);                      \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+#endif
+
+
+COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->dir = BF(dav1d_cdef_find_dir, neon);
+    c->fb[0] = cdef_filter_8x8_neon;
+    c->fb[1] = cdef_filter_4x8_neon;
+    c->fb[2] = cdef_filter_4x4_neon;
+#endif
+}
diff --git a/src/arm/cpu.c b/src/arm/cpu.c
new file mode 100644 (file)
index 0000000..b7a0d3a
--- /dev/null
+++ b/src/arm/cpu.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/arm/cpu.h"
+
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+// NEON is always available; runtime tests are not needed.
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#include <sys/auxv.h>
+
+#ifndef HWCAP_ARM_NEON
+#define HWCAP_ARM_NEON (1 << 12)
+#endif
+#define NEON_HWCAP HWCAP_ARM_NEON
+
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
+#elif defined(__ANDROID__)
+#include <stdio.h>
+#include <string.h>
+
+static unsigned parse_proc_cpuinfo(const char *flag) {
+    FILE *file = fopen("/proc/cpuinfo", "r");
+    if (!file)
+        return 0;
+
+    char line_buffer[120];
+    const char *line;
+
+    while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
+        if (strstr(line, flag)) {
+            fclose(file);
+            return 1;
+        }
+        // if the line is incomplete, seek back to avoid splitting the
+        // search string across two buffer reads
+        if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+            // use fseek, since the 64-bit fseeko is only available from
+            // Android API level 24 on, and meson defines _FILE_OFFSET_BITS
+            // as 64 by default
+            if (fseek(file, -strlen(flag), SEEK_CUR))
+                break;
+        }
+    }
+
+    fclose(file);
+
+    return 0;
+}
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+    unsigned flags = 0;
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(__ANDROID__)
+    flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#endif
+
+    return flags;
+}
diff --git a/src/arm/cpu.h b/src/arm/cpu.h
new file mode 100644 (file)
index 0000000..8c10a1b
--- /dev/null
+++ b/src/arm/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
+
+enum CpuFlags {
+    DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_arm(void);
+
+#endif /* DAV1D_SRC_ARM_CPU_H */
diff --git a/src/arm/ipred_init_tmpl.c b/src/arm/ipred_init_tmpl.c
new file mode 100644 (file)
index 0000000..e42ceaf
--- /dev/null
+++ b/src/arm/ipred_init_tmpl.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->intra_pred[DC_PRED]       = BF(dav1d_ipred_dc, neon);
+    c->intra_pred[DC_128_PRED]   = BF(dav1d_ipred_dc_128, neon);
+    c->intra_pred[TOP_DC_PRED]   = BF(dav1d_ipred_dc_top, neon);
+    c->intra_pred[LEFT_DC_PRED]  = BF(dav1d_ipred_dc_left, neon);
+    c->intra_pred[HOR_PRED]      = BF(dav1d_ipred_h, neon);
+    c->intra_pred[VERT_PRED]     = BF(dav1d_ipred_v, neon);
+#if ARCH_AARCH64
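+    // The remaining predictors only have AArch64 NEON implementations so far.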
+    c->intra_pred[PAETH_PRED]    = BF(dav1d_ipred_paeth, neon);
+    c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
+    c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+    c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+    c->intra_pred[FILTER_PRED]   = BF(dav1d_ipred_filter, neon);
+
+    c->cfl_pred[DC_PRED]         = BF(dav1d_ipred_cfl, neon);
+    c->cfl_pred[DC_128_PRED]     = BF(dav1d_ipred_cfl_128, neon);
+    c->cfl_pred[TOP_DC_PRED]     = BF(dav1d_ipred_cfl_top, neon);
+    c->cfl_pred[LEFT_DC_PRED]    = BF(dav1d_ipred_cfl_left, neon);
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+
+    c->pal_pred                  = BF(dav1d_pal_pred, neon);
+#endif
+#endif
+}
diff --git a/src/arm/itx_init_tmpl.c b/src/arm/itx_init_tmpl.c
new file mode 100644 (file)
index 0000000..ad418f2
--- /dev/null
+++ b/src/arm/itx_init_tmpl.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
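+// The numeric suffix is how many of the transform type combinations get
+// declared for a given size: 2 (dct_dct and identity_identity only), 12, 16,
+// or all 17 including wht_wht (4x4 only).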
+decl_itx17_fns( 4,  4, neon);
+decl_itx16_fns( 4,  8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8,  4, neon);
+decl_itx16_fns( 8,  8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16,  4, neon);
+decl_itx16_fns(16,  8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32,  8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
+    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
+    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
+    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
+    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
+
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+    if (bpc > 10) return;
+
+#if ARCH_AARCH64 || BITDEPTH == 8
+    assign_itx17_fn( ,  4,  4, neon);
+    assign_itx16_fn(R,  4,  8, neon);
+    assign_itx16_fn(R,  4, 16, neon);
+    assign_itx16_fn(R,  8,  4, neon);
+    assign_itx16_fn( ,  8,  8, neon);
+    assign_itx16_fn(R,  8, 16, neon);
+    assign_itx2_fn (R,  8, 32, neon);
+    assign_itx16_fn(R, 16,  4, neon);
+    assign_itx16_fn(R, 16,  8, neon);
+    assign_itx12_fn( , 16, 16, neon);
+    assign_itx2_fn (R, 16, 32, neon);
+    assign_itx1_fn (R, 16, 64, neon);
+    assign_itx2_fn (R, 32,  8, neon);
+    assign_itx2_fn (R, 32, 16, neon);
+    assign_itx2_fn ( , 32, 32, neon);
+    assign_itx1_fn (R, 32, 64, neon);
+    assign_itx1_fn (R, 64, 16, neon);
+    assign_itx1_fn (R, 64, 32, neon);
+    assign_itx1_fn ( , 64, 64, neon);
+#endif
+}
diff --git a/src/arm/loopfilter_init_tmpl.c b/src/arm/loopfilter_init_tmpl.c
new file mode 100644 (file)
index 0000000..d44f8e1
--- /dev/null
+++ b/src/arm/loopfilter_init_tmpl.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+#endif
+}
diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration_init_tmpl.c
new file mode 100644 (file)
index 0000000..1f18d62
--- /dev/null
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+// The 8bpc version calculates things slightly differently from the reference
+// C version. The assembly roughly computes this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+//     sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// minus 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. with round_offset
+// precompensated.
+// The 16bpc version calculates things much the same way as the reference
+// C version, but with 1 << (bitdepth + 6 - round_bits_h) subtracted from
+// the end result.
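+// (For 8 bpc, round_bits_h is 3, hence 1 << (8 + 6 - 3) = 2048 above.)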
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+                                     const pixel *src, ptrdiff_t stride,
+                                     const int16_t fh[7], const intptr_t w,
+                                     int h, enum LrEdgeFlags edges
+                                     HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently from the reference C version.
+// The assembly roughly computes this:
+// fv[3] += 128;
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+//     sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+                                     const int16_t *mid, int w, int h,
+                                     const int16_t fv[7], enum LrEdgeFlags edges,
+                                     ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
+                                 const pixel *src, int w, int h);
+
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                               const pixel (*const left)[4],
+                               const pixel *lpf, const ptrdiff_t lpf_stride,
+                               const int w, const int h, const int16_t fh[7],
+                               const int16_t fv[7], const enum LrEdgeFlags edges
+                               HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int16_t, mid, 68 * 384,);
+    int mid_stride = (w + 7) & ~7;
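+    // mid holds the horizontally filtered rows: rows 2..h+1 cover the unit
+    // itself, with up to 2 extra rows above and below filled from lpf when
+    // the corresponding edge is available.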
+
+    // Horizontal filter
+    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
+                                    fh, w, h, edges HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
+                                        fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+                                        lpf + 6 * PXSTRIDE(lpf_stride),
+                                        lpf_stride, fh, w, 2, edges
+                                        HIGHBD_TAIL_SUFFIX);
+
+    // Vertical filter
+    if (w >= 8)
+        BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
+                                        w & ~7, h, fv, edges,
+                                        mid_stride * sizeof(*mid)
+                                        HIGHBD_TAIL_SUFFIX);
+    if (w & 7) {
+        // For uneven widths, do a full 8-pixel-wide filtering into a temp
+        // buffer and copy the narrow slice of pixels out to dest separately.
+        ALIGN_STK_16(pixel, tmp, 64 * 8,);
+        BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
+                                        &mid[2*mid_stride + (w & ~7)],
+                                        w & 7, h, fv, edges,
+                                        mid_stride * sizeof(*mid)
+                                        HIGHBD_TAIL_SUFFIX);
+        BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
+    }
+}
+
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+    BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, lpf_stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
+
+    dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+    BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP)
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, lpf_stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM)
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
+
+    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int w, const int h,
+                                   const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int16_t *t2,
+                                   const int w, const int h,
+                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                             const pixel (*const left)[4],
+                             const pixel *lpf, const ptrdiff_t lpf_stride,
+                             const int w, const int h, const int sgr_idx,
+                             const int16_t sgr_wt[7], const enum LrEdgeFlags edges
+                             HIGHBD_DECL_SUFFIX)
+{
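+    // A zero in dav1d_sgr_params[sgr_idx][0] selects the 3x3 (radius 1) pass
+    // only, a zero in [1] the 5x5 (radius 2) pass only; otherwise both run
+    // and are blended with the weights computed below.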
+    if (!dav1d_sgr_params[sgr_idx][0]) {
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+        dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges
+                               HIGHBD_TAIL_SUFFIX);
+        if (w >= 8)
+            BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8-pixel-wide filtering into a
+            // temp buffer and copy the narrow slice of pixels out to dest
+            // separately.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp + (w & ~7), w & 7, h,
+                                          (1 << 7) - sgr_wt[1]
+                                          HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    } else if (!dav1d_sgr_params[sgr_idx][1]) {
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+        dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges
+                               HIGHBD_TAIL_SUFFIX);
+        if (w >= 8)
+            BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp, w & ~7, h, sgr_wt[0]
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8-pixel-wide filtering into a
+            // temp buffer and copy the narrow slice of pixels out to dest
+            // separately.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp + (w & ~7), w & 7, h, sgr_wt[0]
+                                          HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    } else {
+        ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+        dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges
+                               HIGHBD_TAIL_SUFFIX);
+        dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges
+                               HIGHBD_TAIL_SUFFIX);
+        const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
+        if (w >= 8)
+            BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
+                                          tmp1, tmp2, w & ~7, h, wt
+                                          HIGHBD_TAIL_SUFFIX);
+        if (w & 7) {
+            // For uneven widths, do a full 8-pixel-wide filtering into a
+            // temp buffer and copy the narrow slice of pixels out to dest
+            // separately.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
+                                          dst + (w & ~7), dst_stride,
+                                          tmp1 + (w & ~7), tmp2 + (w & ~7),
+                                          w & 7, h, wt HIGHBD_TAIL_SUFFIX);
+            BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+                                        w & 7, h);
+        }
+    }
+}
+#endif // BITDEPTH == 8 || ARCH_AARCH64
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 || ARCH_AARCH64
+    c->wiener = wiener_filter_neon;
+    if (bpc <= 10)
+        c->selfguided = sgr_filter_neon;
+#endif
+}
diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc_init_tmpl.c
new file mode 100644 (file)
index 0000000..399ad41
--- /dev/null
+++ b/src/arm/mc_init_tmpl.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+    c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+    c->mct[type] = BF(dav1d_prep_##name, suffix)
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
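+    // As elsewhere in the ARM DSP init code, the 8 bpc NEON functions cover
+    // both 32-bit ARM and AArch64, while the 16 bpc ones are AArch64-only.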
+#if BITDEPTH == 8 || ARCH_AARCH64
+    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               neon);
+
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
+
+    c->avg = BF(dav1d_avg, neon);
+    c->w_avg = BF(dav1d_w_avg, neon);
+    c->mask = BF(dav1d_mask, neon);
+    c->blend = BF(dav1d_blend, neon);
+    c->blend_h = BF(dav1d_blend_h, neon);
+    c->blend_v = BF(dav1d_blend_v, neon);
+    c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+    c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+    c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+    c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+    c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+    c->emu_edge = BF(dav1d_emu_edge, neon);
+#endif
+}
diff --git a/src/arm/msac.h b/src/arm/msac.h
new file mode 100644 (file)
index 0000000..9db0bf8
--- /dev/null
+++ b/src/arm/msac.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+
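+/* NEON is mandatory on AArch64; on 32-bit ARM, alias the generic entry points
+ * only when NEON is guaranteed at compile time (__ARM_NEON). */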
+#if ARCH_AARCH64 || defined(__ARM_NEON)
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_neon
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_neon
+#endif
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
diff --git a/src/cdef.h b/src/cdef.h
new file mode 100644 (file)
index 0000000..5dd52cf
--- /dev/null
+++ b/src/cdef.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_H
+#define DAV1D_SRC_CDEF_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+enum CdefEdgeFlags {
+    CDEF_HAVE_LEFT = 1 << 0,
+    CDEF_HAVE_RIGHT = 1 << 1,
+    CDEF_HAVE_TOP = 1 << 2,
+    CDEF_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row_2px)[2];
+#else
+typedef const void *const_left_pixel_row_2px;
+#endif
+
+// CDEF operates entirely on pre-filter data; if bottom/right edges are
+// present (according to $edges), then the pre-filter data is located in
+// $dst. However, the edge pixels above $dst may be post-filter, so in
+// order to get access to pre-filter top pixels, use $top.
+#define decl_cdef_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
+            const pixel *top, int pri_strength, int sec_strength, \
+            int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_fn(*cdef_fn);
+
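+// Estimate the dominant edge direction of an 8x8 block; *var receives a
+// measure of how strongly directional the block is.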
+#define decl_cdef_dir_fn(name) \
+int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_dir_fn(*cdef_dir_fn);
+
+typedef struct Dav1dCdefDSPContext {
+    cdef_dir_fn dir;
+    cdef_fn fb[3 /* 444/luma, 422, 420 */];
+} Dav1dCdefDSPContext;
+
+bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
+bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
+bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
+bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
+
+#endif /* DAV1D_SRC_CDEF_H */
diff --git a/src/cdef_apply.h b/src/cdef_apply.h
new file mode 100644 (file)
index 0000000..ffdffba
--- /dev/null
+++ b/src/cdef_apply.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_APPLY_H
+#define DAV1D_SRC_CDEF_APPLY_H
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3],
+                             const Av1Filter *lflvl, int by_start, int by_end);
+
+#endif /* DAV1D_SRC_CDEF_APPLY_H */
diff --git a/src/cdef_apply_tmpl.c b/src/cdef_apply_tmpl.c
new file mode 100644 (file)
index 0000000..c45c710
--- /dev/null
+++ b/src/cdef_apply_tmpl.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+
+enum Backup2x8Flags {
+    BACKUP_2X8_Y = 1 << 0,
+    BACKUP_2X8_UV = 1 << 1,
+};
+
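+// Save the bottom two pre-filter rows of the current 8-line strip so they can
+// serve as pre-filter top context for the strip below.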
+static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
+                         const ptrdiff_t stride[2],
+                         const enum Dav1dPixelLayout layout)
+{
+    const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
+    if (y_stride < 0)
+        pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
+    else
+        pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
+
+    if (layout != DAV1D_PIXEL_LAYOUT_I400) {
+        const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
+        if (uv_stride < 0) {
+            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
+            pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
+            pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
+        } else {
+            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
+            pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
+            pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
+        }
+    }
+}
+
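+// Save the 2-pixel-wide column just left of x_off (up to 8 rows) so it can
+// later be restored as a block's pre-filter left edge.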
+static void backup2x8(pixel dst[3][8][2],
+                      /*const*/ pixel *const src[3],
+                      const ptrdiff_t src_stride[2], int x_off,
+                      const enum Dav1dPixelLayout layout,
+                      const enum Backup2x8Flags flag)
+{
+    ptrdiff_t y_off = 0;
+    if (flag & BACKUP_2X8_Y) {
+        for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+            pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+    }
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
+        return;
+
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    x_off >>= ss_hor;
+    y_off = 0;
+    for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+    }
+}
+
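+// Scale the primary CDEF strength by the local directionality reported by
+// dir(): zero variance disables the filter entirely; otherwise the coded
+// strength is scaled by (4 + min(log2(var >> 6), 12)) / 16, reaching full
+// strength for highly directional blocks.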
+static int adjust_strength(const int strength, const unsigned var) {
+    if (!var) return 0;
+    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
+    return (strength * (4 + i) + 8) >> 4;
+}
+
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
+                             pixel *const p[3],
+                             const Av1Filter *const lflvl,
+                             const int by_start, const int by_end)
+{
+    const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
+    pixel *ptrs[3] = { p[0], p[1], p[2] };
+    const int sbsz = 16;
+    const int sb64w = f->sb128w << 1;
+    const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
+    const enum Dav1dPixelLayout layout = f->cur.p.layout;
+    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
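+    // 4:2:2 chroma is subsampled horizontally but not vertically, so the luma
+    // direction must be remapped before filtering the chroma planes: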
+    static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
+                                           { 7, 0, 2, 4, 5, 6, 6, 6 } };
+    const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
+
+    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
+        const int tf = f->lf.top_pre_cdef_toggle;
+        const int by_idx = by & 30;
+        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
+
+        if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
+            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
+
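+        // lr_bak is a two-slot ring buffer (indexed by "bit") holding 2x8
+        // pre-filter left-edge backups for the current and previous block.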
+        ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
+        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
+        edges &= ~CDEF_HAVE_LEFT;
+        edges |= CDEF_HAVE_RIGHT;
+        enum Backup2x8Flags prev_flag = 0;
+        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
+            const int sb128x = sbx >> 1;
+            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
+            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+            if (cdef_idx == -1 ||
+                (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
+                 !f->frame_hdr->cdef.uv_strength[cdef_idx]))
+            {
+                last_skip = 1;
+                goto next_sb;
+            }
+
+            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
+            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
+            const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
+
+            const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
+            int y_sec_lvl = y_lvl & 3;
+            y_sec_lvl += y_sec_lvl == 3;
+            y_sec_lvl <<= bitdepth_min_8;
+
+            const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
+            int uv_sec_lvl = uv_lvl & 3;
+            uv_sec_lvl += uv_sec_lvl == 3;
+            uv_sec_lvl <<= bitdepth_min_8;
+
+            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
+            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
+                 bx += 2, edges |= CDEF_HAVE_LEFT)
+            {
+                if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
+
+                // check if this 8x8 block had any coded coefficients; if not,
+                // go to the next block
+                const unsigned bx_mask = 3U << (bx & 14);
+                const int bx_idx = (bx & 16) >> 4;
+                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
+                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
+                {
+                    last_skip = 1;
+                    goto next_b;
+                }
+                const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag;
+                prev_flag = flag;
+                if (do_left && edges & CDEF_HAVE_LEFT) {
+                    // the pre-filter left edge wasn't backed up while handling
+                    // the previous block (it was skipped, or covered fewer
+                    // planes), so back it up here instead
+                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
+                }
+                if (edges & CDEF_HAVE_RIGHT) {
+                    // backup pre-filter data for next iteration
+                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
+                }
+
+                int dir;
+                unsigned variance;
+                if (y_pri_lvl || uv_pri_lvl)
+                    dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
+                                        &variance HIGHBD_CALL_SUFFIX);
+
+                if (y_pri_lvl) {
+                    const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
+                    if (adj_y_pri_lvl || y_sec_lvl)
+                        dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+                                        &f->lf.cdef_line[tf][0][bx * 4],
+                                        adj_y_pri_lvl, y_sec_lvl, dir,
+                                        damping, edges HIGHBD_CALL_SUFFIX);
+                } else if (y_sec_lvl)
+                    dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+                                    &f->lf.cdef_line[tf][0][bx * 4],
+                                    0, y_sec_lvl, 0,
+                                    damping, edges HIGHBD_CALL_SUFFIX);
+                if (uv_lvl) {
+                    assert(layout != DAV1D_PIXEL_LAYOUT_I400);
+                    const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
+                    for (int pl = 1; pl <= 2; pl++) {
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
+                                             &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
+                                             uv_pri_lvl, uv_sec_lvl, uvdir,
+                                             damping - 1, edges HIGHBD_CALL_SUFFIX);
+                    }
+                }
+
+                bit ^= 1;
+                last_skip = 0;
+
+            next_b:
+                bptrs[0] += 8;
+                bptrs[1] += 8 >> ss_hor;
+                bptrs[2] += 8 >> ss_hor;
+            }
+
+        next_sb:
+            iptrs[0] += sbsz * 4;
+            iptrs[1] += sbsz * 4 >> ss_hor;
+            iptrs[2] += sbsz * 4 >> ss_hor;
+        }
+
+        ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
+        ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+        ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+        f->lf.top_pre_cdef_toggle ^= 1;
+    }
+}
diff --git a/src/cdef_tmpl.c b/src/cdef_tmpl.c
new file mode 100644 (file)
index 0000000..41e3fe6
--- /dev/null
+++ b/src/cdef_tmpl.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/tables.h"
+
+static inline int constrain(const int diff, const int threshold,
+                            const int shift)
+{
+    const int adiff = abs(diff);
+    return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
+}
+
+static inline void fill(int16_t *tmp, const ptrdiff_t stride,
+                        const int w, const int h)
+{
+    /* Use a value that's a large positive number when interpreted as unsigned,
+     * and a large negative number when interpreted as signed. */
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            tmp[x] = INT16_MIN;
+        tmp += stride;
+    }
+}
+
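+/* Copy the 2-pixel border around the w x h block into the padded int16_t
+ * buffer: top rows come from the caller's pre-filter top pointer, left
+ * columns from the backed-up left array, bottom/right pixels directly from
+ * src, and unavailable edges are filled with INT16_MIN as an "invalid"
+ * marker. */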
+static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
+                    const pixel *src, const ptrdiff_t src_stride,
+                    const pixel (*left)[2], const pixel *top,
+                    const int w, const int h,
+                    const enum CdefEdgeFlags edges)
+{
+    // fill extended input buffer
+    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
+    if (!(edges & CDEF_HAVE_TOP)) {
+        fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
+        y_start = 0;
+    }
+    if (!(edges & CDEF_HAVE_BOTTOM)) {
+        fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2);
+        y_end -= 2;
+    }
+    if (!(edges & CDEF_HAVE_LEFT)) {
+        fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start);
+        x_start = 0;
+    }
+    if (!(edges & CDEF_HAVE_RIGHT)) {
+        fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start);
+        x_end -= 2;
+    }
+
+    for (int y = y_start; y < 0; y++) {
+        for (int x = x_start; x < x_end; x++)
+            tmp[x + y * tmp_stride] = top[x];
+        top += PXSTRIDE(src_stride);
+    }
+    for (int y = 0; y < h; y++)
+        for (int x = x_start; x < 0; x++)
+            tmp[x + y * tmp_stride] = left[y][2 + x];
+    for (int y = 0; y < y_end; y++) {
+        for (int x = (y < h) ? 0 : x_start; x < x_end; x++)
+            tmp[x] = src[x];
+        src += PXSTRIDE(src_stride);
+        tmp += tmp_stride;
+    }
+}
+
+static NOINLINE void
+cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const pixel (*left)[2], const pixel *const top,
+                    const int pri_strength, const int sec_strength,
+                    const int dir, const int damping, const int w, int h,
+                    const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    const ptrdiff_t tmp_stride = 12;
+    assert((w == 4 || w == 8) && (h == 4 || h == 8));
+    int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
+    int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
+
+    padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
+
+    if (pri_strength) {
+        const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+        const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
+        const int pri_shift = imax(0, damping - ulog2(pri_strength));
+        if (sec_strength) {
+            const int sec_shift = imax(0, damping - ulog2(sec_strength));
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = dst[x];
+                    int sum = 0;
+                    int max = px, min = px;
+                    int pri_tap_k = pri_tap;
+                    for (int k = 0; k < 2; k++) {
+                        const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
+                        const int p0 = tmp[x + off1];
+                        const int p1 = tmp[x - off1];
+                        sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+                        sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+                        // if pri_tap_k == 4 it becomes 2, otherwise it stays 3
+                        pri_tap_k = (pri_tap_k & 3) | 2;
+                        min = umin(p0, min);
+                        max = imax(p0, max);
+                        min = umin(p1, min);
+                        max = imax(p1, max);
+                        const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+                        const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+                        const int s0 = tmp[x + off2];
+                        const int s1 = tmp[x - off2];
+                        const int s2 = tmp[x + off3];
+                        const int s3 = tmp[x - off3];
+                        // sec_tap starts at 2 and becomes 1
+                        const int sec_tap = 2 - k;
+                        sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+                        min = umin(s0, min);
+                        max = imax(s0, max);
+                        min = umin(s1, min);
+                        max = imax(s1, max);
+                        min = umin(s2, min);
+                        max = imax(s2, max);
+                        min = umin(s3, min);
+                        max = imax(s3, max);
+                    }
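+                    // round sum/16 to nearest (ties away from zero), then
+                    // clamp to the min/max of the sampled neighbours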
+                    dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
+                }
+                dst += PXSTRIDE(dst_stride);
+                tmp += tmp_stride;
+            } while (--h);
+        } else { // pri_strength only
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = dst[x];
+                    int sum = 0;
+                    int pri_tap_k = pri_tap;
+                    for (int k = 0; k < 2; k++) {
+                        const int off = dav1d_cdef_directions[dir + 2][k]; // dir
+                        const int p0 = tmp[x + off];
+                        const int p1 = tmp[x - off];
+                        sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+                        sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+                        pri_tap_k = (pri_tap_k & 3) | 2;
+                    }
+                    dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+                }
+                dst += PXSTRIDE(dst_stride);
+                tmp += tmp_stride;
+            } while (--h);
+        }
+    } else { // sec_strength only
+        assert(sec_strength);
+        const int sec_shift = imax(0, damping - ulog2(sec_strength));
+        do {
+            for (int x = 0; x < w; x++) {
+                const int px = dst[x];
+                int sum = 0;
+                for (int k = 0; k < 2; k++) {
+                    const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+                    const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+                    const int s0 = tmp[x + off1];
+                    const int s1 = tmp[x - off1];
+                    const int s2 = tmp[x + off2];
+                    const int s3 = tmp[x - off2];
+                    const int sec_tap = 2 - k;
+                    sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+                }
+                dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+            }
+            dst += PXSTRIDE(dst_stride);
+            tmp += tmp_stride;
+        } while (--h);
+    }
+}
+
+#define cdef_fn(w, h) \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+                                            const ptrdiff_t stride, \
+                                            const pixel (*left)[2], \
+                                            const pixel *const top, \
+                                            const int pri_strength, \
+                                            const int sec_strength, \
+                                            const int dir, \
+                                            const int damping, \
+                                            const enum CdefEdgeFlags edges \
+                                            HIGHBD_DECL_SUFFIX) \
+{ \
+    cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
+                        dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
+}
+
+cdef_fn(4, 4);
+cdef_fn(4, 8);
+cdef_fn(8, 8);
+
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
+                           unsigned *const var HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    int partial_sum_hv[2][8] = { { 0 } };
+    int partial_sum_diag[2][15] = { { 0 } };
+    int partial_sum_alt[4][11] = { { 0 } };
+
+    for (int y = 0; y < 8; y++) {
+        for (int x = 0; x < 8; x++) {
+            const int px = (img[x] >> bitdepth_min_8) - 128;
+
+            partial_sum_diag[0][     y       +  x      ] += px;
+            partial_sum_alt [0][     y       + (x >> 1)] += px;
+            partial_sum_hv  [0][     y                 ] += px;
+            partial_sum_alt [1][3 +  y       - (x >> 1)] += px;
+            partial_sum_diag[1][7 +  y       -  x      ] += px;
+            partial_sum_alt [2][3 - (y >> 1) +  x      ] += px;
+            partial_sum_hv  [1][                x      ] += px;
+            partial_sum_alt [3][    (y >> 1) +  x      ] += px;
+        }
+        img += PXSTRIDE(stride);
+    }
+
+    unsigned cost[8] = { 0 };
+    for (int n = 0; n < 8; n++) {
+        cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
+        cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
+    }
+    cost[2] *= 105;
+    cost[6] *= 105;
+
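+    // div_table[n] = 840 / (n + 1) normalizes each diagonal partial sum by
+    // the number of pixels that contributed to it; the full 8-sample rows
+    // above use 840 / 8 = 105.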
+    static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
+    for (int n = 0; n < 7; n++) {
+        const int d = div_table[n];
+        cost[0] += (partial_sum_diag[0][n]      * partial_sum_diag[0][n] +
+                    partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
+        cost[4] += (partial_sum_diag[1][n]      * partial_sum_diag[1][n] +
+                    partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
+    }
+    cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+    cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105;
+
+    for (int n = 0; n < 4; n++) {
+        unsigned *const cost_ptr = &cost[n * 2 + 1];
+        for (int m = 0; m < 5; m++)
+            *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m];
+        *cost_ptr *= 105;
+        for (int m = 0; m < 3; m++) {
+            const int d = div_table[2 * m + 1];
+            *cost_ptr += (partial_sum_alt[n][m]      * partial_sum_alt[n][m] +
+                          partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d;
+        }
+    }
+
+    int best_dir = 0;
+    unsigned best_cost = cost[0];
+    for (int n = 1; n < 8; n++) {
+        if (cost[n] > best_cost) {
+            best_cost = cost[n];
+            best_dir = n;
+        }
+    }
+
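+    // "variance" is the gap between the best direction's cost and that of
+    // the orthogonal direction (best_dir ^ 4), scaled down by 1024: large
+    // for strongly directional blocks, small for flat or noisy ones.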
+    *var = (best_cost - (cost[best_dir ^ 4])) >> 10;
+    return best_dir;
+}
+
+COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
+    c->dir = cdef_find_dir_c;
+    c->fb[0] = cdef_filter_block_8x8_c;
+    c->fb[1] = cdef_filter_block_4x8_c;
+    c->fb[2] = cdef_filter_block_4x4_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_cdef_dsp_init_arm)(c);
+#elif ARCH_PPC64LE
+    bitfn(dav1d_cdef_dsp_init_ppc)(c);
+#elif ARCH_X86
+    bitfn(dav1d_cdef_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/src/cdf.c b/src/cdf.c
new file mode 100644 (file)
index 0000000..545d07e
--- /dev/null
+++ b/src/cdf.c
@@ -0,0 +1,4145 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "src/thread.h"
+#include "common/intops.h"
+
+#include "src/cdf.h"
+#include "src/tables.h"
+
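+/* CDF values are stored inverted (32768 minus the cumulative probability),
+ * which is the representation the MSAC entropy decoder operates on. The
+ * CDFn() macros below expand n probability values into that form. */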
+#define CDF1(x) (32768-(x))
+
+#define CDF2(a,b) \
+    CDF1(a), CDF1(b)
+#define CDF3(a,b,c) \
+    CDF1(a), CDF2(b,c)
+#define CDF4(a,b,c,d) \
+    CDF1(a), CDF3(b,c,d)
+#define CDF5(a,b,c,d,e) \
+    CDF1(a), CDF4(b,c,d,e)
+#define CDF6(a,b,c,d,e,f) \
+    CDF1(a), CDF5(b,c,d,e,f)
+#define CDF7(a,b,c,d,e,f,g) \
+    CDF1(a), CDF6(b,c,d,e,f,g)
+#define CDF8(a,b,c,d,e,f,g,h) \
+    CDF1(a), CDF7(b,c,d,e,f,g,h)
+#define CDF9(a,b,c,d,e,f,g,h,i) \
+    CDF1(a), CDF8(b,c,d,e,f,g,h,i)
+#define CDF10(a,b,c,d,e,f,g,h,i,j) \
+    CDF1(a), CDF9(b,c,d,e,f,g,h,i,j)
+#define CDF11(a,b,c,d,e,f,g,h,i,j,k) \
+    CDF1(a), CDF10(b,c,d,e,f,g,h,i,j,k)
+#define CDF12(a,b,c,d,e,f,g,h,i,j,k,l) \
+    CDF1(a), CDF11(b,c,d,e,f,g,h,i,j,k,l)
+#define CDF13(a,b,c,d,e,f,g,h,i,j,k,l,m) \
+    CDF1(a), CDF12(b,c,d,e,f,g,h,i,j,k,l,m)
+#define CDF14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
+    CDF1(a), CDF13(b,c,d,e,f,g,h,i,j,k,l,m,n)
+#define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
+    CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o)
+
+static const CdfModeContext av1_default_cdf = {
+    .y_mode = {
+        { CDF12(22801, 23489, 24293, 24756, 25601, 26123,
+                26606, 27418, 27945, 29228, 29685, 30349) },
+        { CDF12(18673, 19845, 22631, 23318, 23950, 24649,
+                25527, 27364, 28152, 29701, 29984, 30852) },
+        { CDF12(19770, 20979, 23396, 23939, 24241, 24654,
+                25136, 27073, 27830, 29360, 29730, 30659) },
+        { CDF12(20155, 21301, 22838, 23178, 23261, 23533,
+                23703, 24804, 25352, 26575, 27016, 28049) },
+    }, .use_filter_intra = {
+        [BS_4x4]     = { CDF1( 4621) },
+        [BS_4x8]     = { CDF1( 6743) },
+        [BS_8x4]     = { CDF1( 5893) },
+        [BS_8x8]     = { CDF1( 7866) },
+        [BS_8x16]    = { CDF1(12551) },
+        [BS_16x8]    = { CDF1( 9394) },
+        [BS_16x16]   = { CDF1(12408) },
+        [BS_16x32]   = { CDF1(14301) },
+        [BS_32x16]   = { CDF1(12756) },
+        [BS_32x32]   = { CDF1(22343) },
+        [BS_32x64]   = { CDF1(16384) },
+        [BS_64x32]   = { CDF1(16384) },
+        [BS_64x64]   = { CDF1(16384) },
+        [BS_64x128]  = { CDF1(16384) },
+        [BS_128x64]  = { CDF1(16384) },
+        [BS_128x128] = { CDF1(16384) },
+        [BS_4x16]    = { CDF1(12770) },
+        [BS_16x4]    = { CDF1(10368) },
+        [BS_8x32]    = { CDF1(20229) },
+        [BS_32x8]    = { CDF1(18101) },
+        [BS_16x64]   = { CDF1(16384) },
+        [BS_64x16]   = { CDF1(16384) },
+    }, .filter_intra = {
+        CDF4(8949, 12776, 17211, 29558),
+    }, .uv_mode = {
+        {
+            { CDF12(22631, 24152, 25378, 25661, 25986, 26520,
+                    27055, 27923, 28244, 30059, 30941, 31961) },
+            { CDF12( 9513, 26881, 26973, 27046, 27118, 27664,
+                    27739, 27824, 28359, 29505, 29800, 31796) },
+            { CDF12( 9845,  9915, 28663, 28704, 28757, 28780,
+                    29198, 29822, 29854, 30764, 31777, 32029) },
+            { CDF12(13639, 13897, 14171, 25331, 25606, 25727,
+                    25953, 27148, 28577, 30612, 31355, 32493) },
+            { CDF12( 9764,  9835,  9930,  9954, 25386, 27053,
+                    27958, 28148, 28243, 31101, 31744, 32363) },
+            { CDF12(11825, 13589, 13677, 13720, 15048, 29213,
+                    29301, 29458, 29711, 31161, 31441, 32550) },
+            { CDF12(14175, 14399, 16608, 16821, 17718, 17775,
+                    28551, 30200, 30245, 31837, 32342, 32667) },
+            { CDF12(12885, 13038, 14978, 15590, 15673, 15748,
+                    16176, 29128, 29267, 30643, 31961, 32461) },
+            { CDF12(12026, 13661, 13874, 15305, 15490, 15726,
+                    15995, 16273, 28443, 30388, 30767, 32416) },
+            { CDF12(19052, 19840, 20579, 20916, 21150, 21467,
+                    21885, 22719, 23174, 28861, 30379, 32175) },
+            { CDF12(18627, 19649, 20974, 21219, 21492, 21816,
+                    22199, 23119, 23527, 27053, 31397, 32148) },
+            { CDF12(17026, 19004, 19997, 20339, 20586, 21103,
+                    21349, 21907, 22482, 25896, 26541, 31819) },
+            { CDF12(12124, 13759, 14959, 14992, 15007, 15051,
+                    15078, 15166, 15255, 15753, 16039, 16606) },
+        }, {
+            { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899,
+                    15656, 15986, 20086, 20995, 22455, 24212) },
+            { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199,
+                    21451, 22099, 24228, 24693, 27032, 29472) },
+            { CDF13( 5273,  5379, 20177, 20270, 20385, 20439, 20949,
+                    21695, 21774, 23138, 24256, 24703, 26679) },
+            { CDF13( 6740,  7167,  7662, 14152, 14536, 14785, 15034,
+                    16741, 18371, 21520, 22206, 23389, 24182) },
+            { CDF13( 4987,  5368,  5928,  6068, 19114, 20315, 21857,
+                    22253, 22411, 24911, 25380, 26027, 26376) },
+            { CDF13( 5370,  6889,  7247,  7393,  9498, 21114, 21402,
+                    21753, 21981, 24780, 25386, 26517, 27176) },
+            { CDF13( 4816,  4961,  7204,  7326,  8765,  8930, 20169,
+                    20682, 20803, 23188, 23763, 24455, 24940) },
+            { CDF13( 6608,  6740,  8529,  9049,  9257,  9356,  9735,
+                    18827, 19059, 22336, 23204, 23964, 24793) },
+            { CDF13( 5998,  7419,  7781,  8933,  9255,  9549,  9753,
+                    10417, 18898, 22494, 23139, 24764, 25989) },
+            { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040,
+                    15004, 15534, 20714, 21789, 23443, 24861) },
+            { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245,
+                    15235, 15902, 20102, 22696, 23774, 25838) },
+            { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125,
+                    15163, 15636, 19676, 20474, 23519, 25208) },
+            { CDF13( 3144,  5087,  7382,  7504,  7593,  7690,  7801,
+                     8064,  8232,  9248,  9875, 10521, 29048) },
+        },
+    }, .angle_delta = {
+        { CDF6( 2180,  5032,  7567, 22776, 26989, 30217) },
+        { CDF6( 2301,  5608,  8801, 23487, 26974, 30330) },
+        { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) },
+        { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) },
+        { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) },
+        { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) },
+        { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) },
+        { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) },
+    }, .filter = {
+        {
+            { CDF2(31935, 32720) }, { CDF2( 5568, 32719) },
+            { CDF2(  422,  2938) }, { CDF2(28244, 32608) },
+            { CDF2(31206, 31953) }, { CDF2( 4862, 32121) },
+            { CDF2(  770,  1152) }, { CDF2(20889, 25637) },
+        }, {
+            { CDF2(31910, 32724) }, { CDF2( 4120, 32712) },
+            { CDF2(  305,  2247) }, { CDF2(27403, 32636) },
+            { CDF2(31022, 32009) }, { CDF2( 2963, 32093) },
+            { CDF2(  601,   943) }, { CDF2(14969, 21398) },
+        },
+    }, .newmv_mode = {
+        { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) },
+        { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) },
+    }, .globalmv_mode = {
+        { CDF1( 2175) }, { CDF1( 1054) },
+    }, .refmv_mode = {
+        { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) },
+        { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) },
+    }, .drl_bit = {
+        { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) },
+    }, .comp_inter_mode = {
+        { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+        { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+        { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+        { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+        { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+        { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+        { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+        { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) },
+    }, .intra = {
+        { CDF1(  806) }, { CDF1(16662) }, { CDF1(20186) },
+        { CDF1(26538) },
+    }, .comp = {
+        { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) },
+        { CDF1(10640) }, { CDF1( 2901) },
+    }, .comp_dir = {
+        { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) },
+        { CDF1( 7499) }, { CDF1(22475) },
+    }, .jnt_comp = {
+        { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) },
+        { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) },
+    }, .mask_comp = {
+        { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) },
+        { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) },
+    }, .wedge_comp = {
+        { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) },
+        { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) },
+        { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) },
+    }, .wedge_idx = {
+        { CDF15( 2438,  4440,  6599,  8663, 11005, 12874, 15751, 18094,
+                20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+        { CDF15(  806,  3266,  6005,  6738,  7218,  7367,  7771, 14588,
+                16323, 17367, 18452, 19422, 22839, 26127, 29629) },
+        { CDF15( 2779,  3738,  4683,  7213,  7775,  8017,  8655, 14357,
+                17939, 21332, 24520, 27470, 29456, 30529, 31656) },
+        { CDF15( 1684,  3625,  5675,  7108,  9302, 11274, 14429, 17144,
+                19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+        { CDF15( 1142,  3491,  6277,  7314,  8089,  8355,  9023, 13624,
+                15369, 16730, 18114, 19313, 22521, 26012, 29550) },
+        { CDF15( 2742,  4195,  5727,  8035,  8980,  9336, 10146, 14124,
+                17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+        { CDF15( 1727,  3948,  6101,  7796,  9841, 12344, 15766, 18944,
+                20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+        { CDF15(  154,   987,  1925,  2051,  2088,  2111,  2151, 23033,
+                23703, 24284, 24985, 25684, 27259, 28883, 30911) },
+        { CDF15( 1135,  1322,  1493,  2635,  2696,  2737,  2770, 21016,
+                22935, 25057, 27251, 29173, 30089, 30960, 31933) },
+    }, .interintra = {
+        { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) },
+        { CDF1(30237) },
+    }, .interintra_mode = {
+        { CDF3(8192, 16384, 24576) },
+        { CDF3(1875, 11082, 27332) },
+        { CDF3(2473,  9996, 26388) },
+        { CDF3(4238, 11537, 25926) },
+    }, .interintra_wedge = {
+        { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) },
+        { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) },
+        { CDF1(26872) },
+    }, .ref = {
+        { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } },
+        { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } },
+        { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } },
+        { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } },
+        { { CDF1(  904) }, { CDF1(11014) }, { CDF1(26875) } },
+        { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } },
+    }, .comp_fwd_ref = {
+        { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } },
+        { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } },
+        { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } },
+    }, .comp_bwd_ref = {
+        { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } },
+        { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } },
+    }, .comp_uni_ref = {
+        { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } },
+        { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } },
+        { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } },
+    }, .txsz = {
+        {
+            { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) },
+        }, {
+            { CDF2(12272, 30172) }, { CDF2(12272, 30172) },
+            { CDF2(18677, 30848) },
+        }, {
+            { CDF2(12986, 15180) }, { CDF2(12986, 15180) },
+            { CDF2(24302, 25602) },
+        }, {
+            { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) },
+            { CDF2(16803, 22759) },
+        },
+    }, .txpart = {
+        { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } },
+        { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } },
+        { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } },
+        { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } },
+        { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } },
+        { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } },
+        { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } },
+    }, .txtp_inter1 = {
+        { CDF15( 4458,  5560,  7695,  9709, 13330, 14789, 17537, 20266,
+                21504, 22848, 23934, 25474, 27727, 28915, 30631) },
+        { CDF15( 1645,  2573,  4778,  5711,  7807,  8622, 10522, 15357,
+                17674, 20408, 22517, 25010, 27116, 28856, 30749) },
+    }, .txtp_inter2 = {
+        CDF11(  770,  2421,  5225, 12907, 15819, 18927,
+              21561, 24089, 26595, 28526, 30529)
+    }, .txtp_inter3 = {
+        { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1(  748) },
+    }, .txtp_intra1 = {
+        {
+            { CDF6( 1535,  8035,  9461, 12751, 23467, 27825) },
+            { CDF6(  564,  3335,  9709, 10870, 18143, 28094) },
+            { CDF6(  672,  3247,  3676, 11982, 19415, 23127) },
+            { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) },
+            { CDF6( 4423,  6074,  7985, 10416, 25693, 29298) },
+            { CDF6( 1486,  4241,  9460, 10662, 16456, 27694) },
+            { CDF6(  439,  2838,  3522,  6737, 18058, 23754) },
+            { CDF6( 1190,  4233,  4855, 11670, 20281, 24377) },
+            { CDF6( 1045,  4312,  8647, 10159, 18644, 29335) },
+            { CDF6(  202,  3734,  4747,  7298, 17127, 24016) },
+            { CDF6(  447,  4312,  6819,  8884, 16010, 23858) },
+            { CDF6(  277,  4369,  5255,  8905, 16465, 22271) },
+            { CDF6( 3409,  5436, 10599, 15599, 19687, 24040) },
+        }, {
+            { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) },
+            { CDF6(  326,  8796, 14632, 15079, 19272, 27486) },
+            { CDF6(  484,  7576,  7712, 14443, 19159, 22591) },
+            { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) },
+            { CDF6(  655,  4854,  5249,  5913, 22099, 27138) },
+            { CDF6( 1299,  6458,  8885,  9290, 14851, 25497) },
+            { CDF6(  311,  5295,  5552,  6885, 16107, 22672) },
+            { CDF6(  883,  8059,  8270, 11258, 17289, 21549) },
+            { CDF6(  741,  7580,  9318, 10345, 16688, 29046) },
+            { CDF6(  110,  7406,  7915,  9195, 16041, 23329) },
+            { CDF6(  363,  7974,  9357, 10673, 15629, 24474) },
+            { CDF6(  153,  7647,  8112,  9936, 15307, 19996) },
+            { CDF6( 3511,  6332, 11165, 15335, 19323, 23594) },
+        },
+    }, .txtp_intra2 = {
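+        // the first two entries are flat (uniform over the 5 symbols)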
+        {
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+        }, {
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+            { CDF4( 6554, 13107, 19661, 26214) },
+        }, {
+            { CDF4( 1127, 12814, 22772, 27483) },
+            { CDF4(  145,  6761, 11980, 26667) },
+            { CDF4(  362,  5887, 11678, 16725) },
+            { CDF4(  385, 15213, 18587, 30693) },
+            { CDF4(   25,  2914, 23134, 27903) },
+            { CDF4(   60,  4470, 11749, 23991) },
+            { CDF4(   37,  3332, 14511, 21448) },
+            { CDF4(  157,  6320, 13036, 17439) },
+            { CDF4(  119,  6719, 12906, 29396) },
+            { CDF4(   47,  5537, 12576, 21499) },
+            { CDF4(  269,  6076, 11258, 23115) },
+            { CDF4(   83,  5615, 12001, 17228) },
+            { CDF4( 1968,  5556, 12023, 18547) },
+        },
+    }, .skip = {
+        { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) },
+    }, .skip_mode = {
+        { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) },
+    }, .partition = {
+        {
+            // 128x128 -> 64x64
+            { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+            { CDF7( 6607,  6990,  8268, 32060, 32219, 32338, 32371) },
+            { CDF7( 5429,  6676,  7122, 32027, 32227, 32531, 32582) },
+            { CDF7(  711,   966,  1172, 32448, 32538, 32617, 32664) },
+        }, {
+            // 64x64 -> 32x32
+            { CDF9(20137, 21547, 23078, 29566, 29837,
+                   30261, 30524, 30892, 31724) },
+            { CDF9( 6732,  7490,  9497, 27944, 28250,
+                   28515, 28969, 29630, 30104) },
+            { CDF9( 5945,  7663,  8348, 28683, 29117,
+                   29749, 30064, 30298, 32238) },
+            { CDF9(  870,  1212,  1487, 31198, 31394,
+                   31574, 31743, 31881, 32332) },
+        }, {
+            // 32x32 -> 16x16
+            { CDF9(18462, 20920, 23124, 27647, 28227,
+                   29049, 29519, 30178, 31544) },
+            { CDF9( 7689,  9060, 12056, 24992, 25660,
+                   26182, 26951, 28041, 29052) },
+            { CDF9( 6015,  9009, 10062, 24544, 25409,
+                   26545, 27071, 27526, 32047) },
+            { CDF9( 1394,  2208,  2796, 28614, 29061,
+                   29466, 29840, 30185, 31899) },
+        }, {
+            // 16x16 -> 8x8
+            { CDF9(15597, 20929, 24571, 26706, 27664,
+                   28821, 29601, 30571, 31902) },
+            { CDF9( 7925, 11043, 16785, 22470, 23971,
+                   25043, 26651, 28701, 29834) },
+            { CDF9( 5414, 13269, 15111, 20488, 22360,
+                   24500, 25537, 26336, 32117) },
+            { CDF9( 2662,  6362,  8614, 20860, 23053,
+                   24778, 26436, 27829, 31171) },
+        }, {
+            // 8x8 -> 4x4 only supports the four legacy partition types
+            { CDF3(19132, 25510, 30392) },
+            { CDF3(13928, 19855, 28540) },
+            { CDF3(12522, 23679, 28629) },
+            { CDF3( 9896, 18783, 25853) },
+        },
+    }, .seg_pred = {
+        { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+    }, .seg_id = {
+        { CDF7( 5622,  7893, 16093, 18233, 27809, 28373, 32533) },
+        { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) },
+        { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) },
+    }, .cfl_sign = {
+        CDF7( 1418,  2123, 13340, 18405, 26972, 28343, 32294)
+    }, .cfl_alpha = {
+        { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696,
+                32700, 32704, 32708, 32712, 32716, 32720, 32724) },
+        { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573,
+                32620, 32647, 32668, 32672, 32676, 32680, 32684) },
+        { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649,
+                32673, 32677, 32681, 32685, 32689, 32693, 32697) },
+        { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704,
+                32708, 32712, 32716, 32720, 32724, 32728, 32732) },
+        { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321,
+                32394, 32464, 32516, 32560, 32576, 32593, 32622) },
+        { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843,
+                32144, 32413, 32520, 32594, 32622, 32656, 32660) },
+    }, .restore_wiener = {
+        CDF1(11570)
+    }, .restore_sgrproj = {
+        CDF1(16855)
+    }, .restore_switchable = {
+        CDF2( 9413, 22581)
+    }, .delta_q = {
+        CDF3(28160, 32120, 32677)
+    }, .delta_lf = {
+        { CDF3(28160, 32120, 32677) },
+        { CDF3(28160, 32120, 32677) },
+        { CDF3(28160, 32120, 32677) },
+        { CDF3(28160, 32120, 32677) },
+        { CDF3(28160, 32120, 32677) },
+    }, .motion_mode = {
+        [BS_8x8]     = { CDF2( 7651, 24760) },
+        [BS_8x16]    = { CDF2( 4738, 24765) },
+        [BS_8x32]    = { CDF2(28799, 31390) },
+        [BS_16x8]    = { CDF2( 5391, 25528) },
+        [BS_16x16]   = { CDF2(19419, 26810) },
+        [BS_16x32]   = { CDF2( 5123, 23606) },
+        [BS_16x64]   = { CDF2(28973, 31594) },
+        [BS_32x8]    = { CDF2(26431, 30774) },
+        [BS_32x16]   = { CDF2(11606, 24308) },
+        [BS_32x32]   = { CDF2(26260, 29116) },
+        [BS_32x64]   = { CDF2(20360, 28062) },
+        [BS_64x16]   = { CDF2(29742, 31203) },
+        [BS_64x32]   = { CDF2(21679, 26830) },
+        [BS_64x64]   = { CDF2(29516, 30701) },
+        [BS_64x128]  = { CDF2(28898, 30397) },
+        [BS_128x64]  = { CDF2(30878, 31335) },
+        [BS_128x128] = { CDF2(32507, 32558) },
+    }, .obmc = {
+        [BS_8x8]     = { CDF1(10437) },
+        [BS_8x16]    = { CDF1( 9371) },
+        [BS_8x32]    = { CDF1(23664) },
+        [BS_16x8]    = { CDF1( 9301) },
+        [BS_16x16]   = { CDF1(17432) },
+        [BS_16x32]   = { CDF1(14423) },
+        [BS_16x64]   = { CDF1(24008) },
+        [BS_32x8]    = { CDF1(20901) },
+        [BS_32x16]   = { CDF1(15142) },
+        [BS_32x32]   = { CDF1(25817) },
+        [BS_32x64]   = { CDF1(22823) },
+        [BS_64x16]   = { CDF1(26879) },
+        [BS_64x32]   = { CDF1(22083) },
+        [BS_64x64]   = { CDF1(30128) },
+        [BS_64x128]  = { CDF1(31014) },
+        [BS_128x64]  = { CDF1(31560) },
+        [BS_128x128] = { CDF1(32638) },
+    }, .pal_y = {
+        { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } },
+        { { CDF1(31912) }, { CDF1( 2859) }, { CDF1(  980) } },
+        { { CDF1(31823) }, { CDF1( 3400) }, { CDF1(  781) } },
+        { { CDF1(32030) }, { CDF1( 3561) }, { CDF1(  904) } },
+        { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } },
+        { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } },
+        { { CDF1(32450) }, { CDF1( 7946) }, { CDF1(  129) } },
+    }, .pal_sz = {
+        {
+            { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) },
+            { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) },
+            { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) },
+            { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) },
+            { CDF6(12725, 19180, 21863, 24839, 27535, 30120) },
+            { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) },
+            { CDF6(14940, 20797, 21678, 24186, 27033, 28999) },
+        }, {
+            { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) },
+            { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) },
+            { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) },
+            { CDF6( 3228,  9464, 14993, 18089, 22523, 27420) },
+            { CDF6( 3768,  8886, 13091, 17852, 22495, 27207) },
+            { CDF6( 2464,  8451, 12861, 21632, 25525, 28555) },
+            { CDF6( 1269,  5435, 10433, 18963, 21700, 25865) },
+        },
+    }, .pal_uv = {
+        { CDF1(32461) }, { CDF1(21488) },
+    }, .color_map = {
+        { /* y */
+            {
+                { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) },
+                { CDF1(27036) }, { CDF1(31603) },
+            }, {
+                { CDF2(27877, 30490) }, { CDF2(11532, 25697) },
+                { CDF2( 6544, 30234) }, { CDF2(23018, 28072) },
+                { CDF2(31915, 32385) },
+            }, {
+                { CDF3(25572, 28046, 30045) },
+                { CDF3( 9478, 21590, 27256) },
+                { CDF3( 7248, 26837, 29824) },
+                { CDF3(19167, 24486, 28349) },
+                { CDF3(31400, 31825, 32250) },
+            }, {
+                { CDF4(24779, 26955, 28576, 30282) },
+                { CDF4( 8669, 20364, 24073, 28093) },
+                { CDF4( 4255, 27565, 29377, 31067) },
+                { CDF4(19864, 23674, 26716, 29530) },
+                { CDF4(31646, 31893, 32147, 32426) },
+            }, {
+                { CDF5(23132, 25407, 26970, 28435, 30073) },
+                { CDF5( 7443, 17242, 20717, 24762, 27982) },
+                { CDF5( 6300, 24862, 26944, 28784, 30671) },
+                { CDF5(18916, 22895, 25267, 27435, 29652) },
+                { CDF5(31270, 31550, 31808, 32059, 32353) },
+            }, {
+                { CDF6(23105, 25199, 26464, 27684, 28931, 30318) },
+                { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) },
+                { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) },
+                { CDF6(18544, 22373, 24457, 26195, 28119, 30045) },
+                { CDF6(31198, 31451, 31670, 31882, 32123, 32391) },
+            }, {
+                { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+                { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+                { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+                { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+                { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+            },
+        }, { /* uv */
+            {
+                { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) },
+                { CDF1(29257) }, { CDF1(31610) },
+            }, {
+                { CDF2(25257, 29145) }, { CDF2(12287, 27293) },
+                { CDF2( 7033, 27960) }, { CDF2(20145, 25405) },
+                { CDF2(30608, 31639) },
+            }, {
+                { CDF3(24210, 27175, 29903) },
+                { CDF3( 9888, 22386, 27214) },
+                { CDF3( 5901, 26053, 29293) },
+                { CDF3(18318, 22152, 28333) },
+                { CDF3(30459, 31136, 31926) },
+            }, {
+                { CDF4(22980, 25479, 27781, 29986) },
+                { CDF4( 8413, 21408, 24859, 28874) },
+                { CDF4( 2257, 29449, 30594, 31598) },
+                { CDF4(19189, 21202, 25915, 28620) },
+                { CDF4(31844, 32044, 32281, 32518) },
+            }, {
+                { CDF5(22217, 24567, 26637, 28683, 30548) },
+                { CDF5( 7307, 16406, 19636, 24632, 28424) },
+                { CDF5( 4441, 25064, 26879, 28942, 30919) },
+                { CDF5(17210, 20528, 23319, 26750, 29582) },
+                { CDF5(30674, 30953, 31396, 31735, 32207) },
+            }, {
+                { CDF6(21239, 23168, 25044, 26962, 28705, 30506) },
+                { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) },
+                { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) },
+                { CDF6(15889, 18323, 21704, 24698, 26976, 29690) },
+                { CDF6(30988, 31204, 31479, 31734, 31983, 32325) },
+            }, {
+                { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+                { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+                { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+                { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+                { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+            },
+        },
+    }, .intrabc = {
+        CDF1(30531)
+    },
+};
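+
+/* Reading note for the tables above and below: AV1's multi-symbol range
+ * coder works on 15-bit probabilities, so each CDFn() row lists n
+ * cumulative thresholds out of 32768 (the CDFn macros take care of the
+ * decoder's internal representation).  Rows such as CDF1(16384) or
+ * CDF3(8192, 16384, 24576) are uniform distributions (1/2 and 1/4 per
+ * symbol) padding context slots that are never actually coded; likewise
+ * the long runs of CDF4(6554, 13107, 19661, 26214) are just a uniform
+ * 5-symbol CDF, since 32768 / 5 ~= 6554. */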
+
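+/* A motion-vector difference is coded per component as a magnitude class
+ * (the 11-symbol CDF10 below), then integer magnitude bits (class0 for
+ * class 0, one binary CDF per bit position in classN otherwise), a
+ * 4-symbol fractional part (class0_fp/classN_fp), an optional
+ * high-precision bit (class0_hp/classN_hp), and a sign.  Both the
+ * horizontal and the vertical component start from this same default. */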
+static const CdfMvComponent default_mv_component_cdf = {
+    .classes = {
+        CDF10(28672, 30976, 31858, 32320, 32551,
+              32656, 32740, 32757, 32762, 32767)
+    }, .class0 = {
+        CDF1(27648)
+    }, .classN = {
+        { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) },
+        { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) },
+        { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) },
+        { CDF1(30720) },
+    }, .class0_fp = {
+        { CDF3(16384, 24576, 26624) },
+        { CDF3(12288, 21248, 24128) },
+    }, .classN_fp = {
+        CDF3( 8192, 17408, 21248)
+    }, .class0_hp = {
+        CDF1(20480)
+    }, .classN_hp = {
+        CDF1(16384)
+    }, .sign = {
+        CDF1(16384)
+    },
+};
+
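+/* The joint symbol is decoded before the components and says which of
+ * them are non-zero: { zero, horizontal only, vertical only, both }.
+ * With the defaults below, p(zero) = 4096 / 32768 = 12.5% and
+ * p(both non-zero) = (32768 - 19328) / 32768 ~= 41%. */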
+static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = {
+    CDF3( 4096, 11264, 19328)
+};
+
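+/* Key-frame luma intra modes use a 13-symbol CDF (CDF12) selected by a
+ * 5x5 context grid derived from the above and left blocks' intra modes.
+ * The "+ 3" in the row size appears to leave room for the per-CDF
+ * adaptation counter while padding each row to 16 uint16_t (32 bytes,
+ * matching the requested alignment). */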
+static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = {
+    {
+        { CDF12(15588, 17027, 19338, 20218, 20682, 21110,
+                21825, 23244, 24189, 28165, 29093, 30466) },
+        { CDF12(12016, 18066, 19516, 20303, 20719, 21444,
+                21888, 23032, 24434, 28658, 30172, 31409) },
+        { CDF12(10052, 10771, 22296, 22788, 23055, 23239,
+                24133, 25620, 26160, 29336, 29929, 31567) },
+        { CDF12(14091, 15406, 16442, 18808, 19136, 19546,
+                19998, 22096, 24746, 29585, 30958, 32462) },
+        { CDF12(12122, 13265, 15603, 16501, 18609, 20033,
+                22391, 25583, 26437, 30261, 31073, 32475) },
+    }, {
+        { CDF12(10023, 19585, 20848, 21440, 21832, 22760,
+                23089, 24023, 25381, 29014, 30482, 31436) },
+        { CDF12( 5983, 24099, 24560, 24886, 25066, 25795,
+                25913, 26423, 27610, 29905, 31276, 31794) },
+        { CDF12( 7444, 12781, 20177, 20728, 21077, 21607,
+                22170, 23405, 24469, 27915, 29090, 30492) },
+        { CDF12( 8537, 14689, 15432, 17087, 17408, 18172,
+                18408, 19825, 24649, 29153, 31096, 32210) },
+        { CDF12( 7543, 14231, 15496, 16195, 17905, 20717,
+                21984, 24516, 26001, 29675, 30981, 31994) },
+    }, {
+        { CDF12(12613, 13591, 21383, 22004, 22312, 22577,
+                23401, 25055, 25729, 29538, 30305, 32077) },
+        { CDF12( 9687, 13470, 18506, 19230, 19604, 20147,
+                20695, 22062, 23219, 27743, 29211, 30907) },
+        { CDF12( 6183,  6505, 26024, 26252, 26366, 26434,
+                27082, 28354, 28555, 30467, 30794, 32086) },
+        { CDF12(10718, 11734, 14954, 17224, 17565, 17924,
+                18561, 21523, 23878, 28975, 30287, 32252) },
+        { CDF12( 9194,  9858, 16501, 17263, 18424, 19171,
+                21563, 25961, 26561, 30072, 30737, 32463) },
+    }, {
+        { CDF12(12602, 14399, 15488, 18381, 18778, 19315,
+                19724, 21419, 25060, 29696, 30917, 32409) },
+        { CDF12( 8203, 13821, 14524, 17105, 17439, 18131,
+                18404, 19468, 25225, 29485, 31158, 32342) },
+        { CDF12( 8451,  9731, 15004, 17643, 18012, 18425,
+                19070, 21538, 24605, 29118, 30078, 32018) },
+        { CDF12( 7714,  9048,  9516, 16667, 16817, 16994,
+                17153, 18767, 26743, 30389, 31536, 32528) },
+        { CDF12( 8843, 10280, 11496, 15317, 16652, 17943,
+                19108, 22718, 25769, 29953, 30983, 32485) },
+    }, {
+        { CDF12(12578, 13671, 15979, 16834, 19075, 20913,
+                22989, 25449, 26219, 30214, 31150, 32477) },
+        { CDF12( 9563, 13626, 15080, 15892, 17756, 20863,
+                22207, 24236, 25380, 29653, 31143, 32277) },
+        { CDF12( 8356,  8901, 17616, 18256, 19350, 20106,
+                22598, 25947, 26466, 29900, 30523, 32261) },
+        { CDF12(10835, 11815, 13124, 16042, 17018, 18039,
+                18947, 22753, 24615, 29489, 30883, 32482) },
+        { CDF12( 7618,  8288,  9859, 10509, 15386, 18657,
+                22903, 28776, 29180, 31355, 31802, 32593) },
+    },
+};
+
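+/* Four default coefficient-CDF sets follow, one per base quantizer
+ * bucket (in libaom the selection is q_idx <= 20, <= 60, <= 120, else,
+ * and dav1d appears to mirror that when initialising per-frame CDFs).
+ * Each set covers the transform-coefficient contexts: all-zero/skip
+ * flags, end-of-block position classes per transform size (eob_bin_*),
+ * the extra eob refinement bit, base-level and high-range ("br")
+ * tokens, and the DC sign. */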
+static const CdfCoefContext av1_default_coef_cdf[4] = {
+    [0] = {
+        .skip = {
+            {
+                { CDF1(31849) }, { CDF1( 5892) }, { CDF1(12112) },
+                { CDF1(21935) }, { CDF1(20289) }, { CDF1(27473) },
+                { CDF1(32487) }, { CDF1( 7654) }, { CDF1(19473) },
+                { CDF1(29984) }, { CDF1( 9961) }, { CDF1(30242) },
+                { CDF1(32117) },
+            }, {
+                { CDF1(31548) }, { CDF1( 1549) }, { CDF1(10130) },
+                { CDF1(16656) }, { CDF1(18591) }, { CDF1(26308) },
+                { CDF1(32537) }, { CDF1( 5403) }, { CDF1(18096) },
+                { CDF1(30003) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(29957) }, { CDF1( 5391) }, { CDF1(18039) },
+                { CDF1(23566) }, { CDF1(22431) }, { CDF1(25822) },
+                { CDF1(32197) }, { CDF1( 3778) }, { CDF1(15336) },
+                { CDF1(28981) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(17920) }, { CDF1( 1818) }, { CDF1( 7282) },
+                { CDF1(25273) }, { CDF1(10923) }, { CDF1(31554) },
+                { CDF1(32624) }, { CDF1( 1366) }, { CDF1(15628) },
+                { CDF1(30462) }, { CDF1(  146) }, { CDF1( 5132) },
+                { CDF1(31657) },
+            }, {
+                { CDF1( 6308) }, { CDF1(  117) }, { CDF1( 1638) },
+                { CDF1( 2161) }, { CDF1(16384) }, { CDF1(10923) },
+                { CDF1(30247) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            },
+        }, .eob_bin_16 = {
+            {
+                { CDF4(  840,  1039,  1980,  4895) },
+                { CDF4(  370,   671,  1883,  4471) },
+            }, {
+                { CDF4( 3247,  4950,  9688, 14563) },
+                { CDF4( 1904,  3354,  7763, 14647) },
+            },
+        }, .eob_bin_32 = {
+            {
+                { CDF5(  400,   520,   977,  2102,  6542) },
+                { CDF5(  210,   405,  1315,  3326,  7537) },
+            }, {
+                { CDF5( 2636,  4273,  7588, 11794, 20401) },
+                { CDF5( 1786,  3179,  6902, 11357, 19054) },
+            },
+        }, .eob_bin_64 = {
+            {
+                { CDF6(  329,   498,  1101,  1784,  3265,  7758) },
+                { CDF6(  335,   730,  1459,  5494,  8755, 12997) },
+            }, {
+                { CDF6( 3505,  5304, 10086, 13814, 17684, 23370) },
+                { CDF6( 1563,  2700,  4876, 10911, 14706, 22480) },
+            },
+        }, .eob_bin_128 = {
+            {
+                { CDF7(  219,   482,  1140,  2091,  3680,  6028, 12586) },
+                { CDF7(  371,   699,  1254,  4830,  9479, 12562, 17497) },
+            }, {
+                { CDF7( 5245,  7456, 12880, 15852, 20033, 23932, 27608) },
+                { CDF7( 2054,  3472,  5869, 14232, 18242, 20590, 26752) },
+            },
+        }, .eob_bin_256 = {
+            {
+                { CDF8(  310,   584,  1887,  3589,
+                        6168,  8611, 11352, 15652) },
+                { CDF8(  998,  1850,  2998,  5604,
+                       17341, 19888, 22899, 25583) },
+            }, {
+                { CDF8( 2520,  3240,  5952,  8870,
+                       12577, 17558, 19954, 24168) },
+                { CDF8( 2203,  4130,  7435, 10739,
+                       20652, 23681, 25609, 27261) },
+            },
+        }, .eob_bin_512 = {
+            { CDF9(  641,   983,  3707,  5430, 10234,
+                   14958, 18788, 23412, 26061) },
+            { CDF9( 5095,  6446,  9996, 13354, 16017,
+                   17986, 20919, 26129, 29140) },
+        }, .eob_bin_1024 = {
+            { CDF10(  393,   421,   751,  1623,  3160,
+                     6352, 13345, 18047, 22571, 25830) },
+            { CDF10( 1865,  1988,  2930,  4242, 10533,
+                    16538, 21354, 27255, 28546, 31784) },
+        }, .eob_hi_bit = {
+            {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16961) },
+                    { CDF1(17223) }, { CDF1( 7621) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(19069) },
+                    { CDF1(22525) }, { CDF1(13377) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20401) },
+                    { CDF1(17025) }, { CDF1(12845) }, { CDF1(12873) },
+                    { CDF1(14094) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20681) },
+                    { CDF1(20701) }, { CDF1(15250) }, { CDF1(15017) },
+                    { CDF1(14928) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(23905) },
+                    { CDF1(17194) }, { CDF1(16170) }, { CDF1(17695) },
+                    { CDF1(13826) }, { CDF1(15810) }, { CDF1(12036) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(23959) },
+                    { CDF1(20799) }, { CDF1(19021) }, { CDF1(16203) },
+                    { CDF1(17886) }, { CDF1(14144) }, { CDF1(12010) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(27399) },
+                    { CDF1(16327) }, { CDF1(18071) }, { CDF1(19584) },
+                    { CDF1(20721) }, { CDF1(18432) }, { CDF1(19560) },
+                    { CDF1(10150) }, { CDF1( 8805) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(24932) },
+                    { CDF1(20833) }, { CDF1(12027) }, { CDF1(16670) },
+                    { CDF1(19914) }, { CDF1(15106) }, { CDF1(17662) },
+                    { CDF1(13783) }, { CDF1(28756) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(23406) },
+                    { CDF1(21845) }, { CDF1(18432) }, { CDF1(16384) },
+                    { CDF1(17096) }, { CDF1(12561) }, { CDF1(17320) },
+                    { CDF1(22395) }, { CDF1(21370) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            },
+        }, .eob_base_tok = {
+            {
+                {
+                    { CDF2(17837, 29055) }, { CDF2(29600, 31446) },
+                    { CDF2(30844, 31878) }, { CDF2(24926, 28948) },
+                }, {
+                    { CDF2(21365, 30026) }, { CDF2(30512, 32423) },
+                    { CDF2(31658, 32621) }, { CDF2(29630, 31881) },
+                },
+            }, {
+                {
+                    { CDF2( 5717, 26477) }, { CDF2(30491, 31703) },
+                    { CDF2(31550, 32158) }, { CDF2(29648, 31491) },
+                }, {
+                    { CDF2(12608, 27820) }, { CDF2(30680, 32225) },
+                    { CDF2(30809, 32335) }, { CDF2(31299, 32423) },
+                },
+            }, {
+                {
+                    { CDF2( 1786, 12612) }, { CDF2(30663, 31625) },
+                    { CDF2(32339, 32468) }, { CDF2(31148, 31833) },
+                }, {
+                    { CDF2(18857, 23865) }, { CDF2(31428, 32428) },
+                    { CDF2(31744, 32373) }, { CDF2(31775, 32526) },
+                },
+            }, {
+                {
+                    { CDF2( 1787,  2532) }, { CDF2(30832, 31662) },
+                    { CDF2(31824, 32682) }, { CDF2(32133, 32569) },
+                }, {
+                    { CDF2(13751, 22235) }, { CDF2(32089, 32409) },
+                    { CDF2(27084, 27920) }, { CDF2(29291, 32594) },
+                },
+            }, {
+                {
+                    { CDF2( 1725,  3449) }, { CDF2(31102, 31935) },
+                    { CDF2(32457, 32613) }, { CDF2(32412, 32649) },
+                }, {
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                },
+            },
+        }, .base_tok = {
+            {
+                {
+                    { CDF3( 4034,  8930, 12727) },
+                    { CDF3(18082, 29741, 31877) },
+                    { CDF3(12596, 26124, 30493) },
+                    { CDF3( 9446, 21118, 27005) },
+                    { CDF3( 6308, 15141, 21279) },
+                    { CDF3( 2463,  6357,  9783) },
+                    { CDF3(20667, 30546, 31929) },
+                    { CDF3(13043, 26123, 30134) },
+                    { CDF3( 8151, 18757, 24778) },
+                    { CDF3( 5255, 12839, 18632) },
+                    { CDF3( 2820,  7206, 11161) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(15736, 27553, 30604) },
+                    { CDF3(11210, 23794, 28787) },
+                    { CDF3( 5947, 13874, 19701) },
+                    { CDF3( 4215,  9323, 13891) },
+                    { CDF3( 2833,  6462, 10059) },
+                    { CDF3(19605, 30393, 31582) },
+                    { CDF3(13523, 26252, 30248) },
+                    { CDF3( 8446, 18622, 24512) },
+                    { CDF3( 3818, 10343, 15974) },
+                    { CDF3( 1481,  4117,  6796) },
+                    { CDF3(22649, 31302, 32190) },
+                    { CDF3(14829, 27127, 30449) },
+                    { CDF3( 8313, 17702, 23304) },
+                    { CDF3( 3022,  8301, 12786) },
+                    { CDF3( 1536,  4412,  7184) },
+                    { CDF3(22354, 29774, 31372) },
+                    { CDF3(14723, 25472, 29214) },
+                    { CDF3( 6673, 13745, 18662) },
+                    { CDF3( 2068,  5766,  9322) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 6302, 16444, 21761) },
+                    { CDF3(23040, 31538, 32475) },
+                    { CDF3(15196, 28452, 31496) },
+                    { CDF3(10020, 22946, 28514) },
+                    { CDF3( 6533, 16862, 23501) },
+                    { CDF3( 3538,  9816, 15076) },
+                    { CDF3(24444, 31875, 32525) },
+                    { CDF3(15881, 28924, 31635) },
+                    { CDF3( 9922, 22873, 28466) },
+                    { CDF3( 6527, 16966, 23691) },
+                    { CDF3( 4114, 11303, 17220) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(20201, 30770, 32209) },
+                    { CDF3(14754, 28071, 31258) },
+                    { CDF3( 8378, 20186, 26517) },
+                    { CDF3( 5916, 15299, 21978) },
+                    { CDF3( 4268, 11583, 17901) },
+                    { CDF3(24361, 32025, 32581) },
+                    { CDF3(18673, 30105, 31943) },
+                    { CDF3(10196, 22244, 27576) },
+                    { CDF3( 5495, 14349, 20417) },
+                    { CDF3( 2676,  7415, 11498) },
+                    { CDF3(24678, 31958, 32585) },
+                    { CDF3(18629, 29906, 31831) },
+                    { CDF3( 9364, 20724, 26315) },
+                    { CDF3( 4641, 12318, 18094) },
+                    { CDF3( 2758,  7387, 11579) },
+                    { CDF3(25433, 31842, 32469) },
+                    { CDF3(18795, 29289, 31411) },
+                    { CDF3( 7644, 17584, 23592) },
+                    { CDF3( 3408,  9014, 15047) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 4536, 10072, 14001) },
+                    { CDF3(25459, 31416, 32206) },
+                    { CDF3(16605, 28048, 30818) },
+                    { CDF3(11008, 22857, 27719) },
+                    { CDF3( 6915, 16268, 22315) },
+                    { CDF3( 2625,  6812, 10537) },
+                    { CDF3(24257, 31788, 32499) },
+                    { CDF3(16880, 29454, 31879) },
+                    { CDF3(11958, 25054, 29778) },
+                    { CDF3( 7916, 18718, 25084) },
+                    { CDF3( 3383,  8777, 13446) },
+                    { CDF3(22720, 31603, 32393) },
+                    { CDF3(14960, 28125, 31335) },
+                    { CDF3( 9731, 22210, 27928) },
+                    { CDF3( 6304, 15832, 22277) },
+                    { CDF3( 2910,  7818, 12166) },
+                    { CDF3(20375, 30627, 32131) },
+                    { CDF3(13904, 27284, 30887) },
+                    { CDF3( 9368, 21558, 27144) },
+                    { CDF3( 5937, 14966, 21119) },
+                    { CDF3( 2667,  7225, 11319) },
+                    { CDF3(23970, 31470, 32378) },
+                    { CDF3(17173, 29734, 32018) },
+                    { CDF3(12795, 25441, 29965) },
+                    { CDF3( 8981, 19680, 25893) },
+                    { CDF3( 4728, 11372, 16902) },
+                    { CDF3(24287, 31797, 32439) },
+                    { CDF3(16703, 29145, 31696) },
+                    { CDF3(10833, 23554, 28725) },
+                    { CDF3( 6468, 16566, 23057) },
+                    { CDF3( 2415,  6562, 10278) },
+                    { CDF3(26610, 32395, 32659) },
+                    { CDF3(18590, 30498, 32117) },
+                    { CDF3(12420, 25756, 29950) },
+                    { CDF3( 7639, 18746, 24710) },
+                    { CDF3( 3001,  8086, 12347) },
+                    { CDF3(25076, 32064, 32580) },
+                    { CDF3(17946, 30128, 32028) },
+                    { CDF3(12024, 24985, 29378) },
+                    { CDF3( 7517, 18390, 24304) },
+                    { CDF3( 3243,  8781, 13331) },
+                }, {
+                    { CDF3( 6037, 16771, 21957) },
+                    { CDF3(24774, 31704, 32426) },
+                    { CDF3(16830, 28589, 31056) },
+                    { CDF3(10602, 22828, 27760) },
+                    { CDF3( 6733, 16829, 23071) },
+                    { CDF3( 3250,  8914, 13556) },
+                    { CDF3(25582, 32220, 32668) },
+                    { CDF3(18659, 30342, 32223) },
+                    { CDF3(12546, 26149, 30515) },
+                    { CDF3( 8420, 20451, 26801) },
+                    { CDF3( 4636, 12420, 18344) },
+                    { CDF3(27581, 32362, 32639) },
+                    { CDF3(18987, 30083, 31978) },
+                    { CDF3(11327, 24248, 29084) },
+                    { CDF3( 7264, 17719, 24120) },
+                    { CDF3( 3995, 10768, 16169) },
+                    { CDF3(25893, 31831, 32487) },
+                    { CDF3(16577, 28587, 31379) },
+                    { CDF3(10189, 22748, 28182) },
+                    { CDF3( 6832, 17094, 23556) },
+                    { CDF3( 3708, 10110, 15334) },
+                    { CDF3(25904, 32282, 32656) },
+                    { CDF3(19721, 30792, 32276) },
+                    { CDF3(12819, 26243, 30411) },
+                    { CDF3( 8572, 20614, 26891) },
+                    { CDF3( 5364, 14059, 20467) },
+                    { CDF3(26580, 32438, 32677) },
+                    { CDF3(20852, 31225, 32340) },
+                    { CDF3(12435, 25700, 29967) },
+                    { CDF3( 8691, 20825, 26976) },
+                    { CDF3( 4446, 12209, 17269) },
+                    { CDF3(27350, 32429, 32696) },
+                    { CDF3(21372, 30977, 32272) },
+                    { CDF3(12673, 25270, 29853) },
+                    { CDF3( 9208, 20925, 26640) },
+                    { CDF3( 5018, 13351, 18732) },
+                    { CDF3(27351, 32479, 32713) },
+                    { CDF3(21398, 31209, 32387) },
+                    { CDF3(12162, 25047, 29842) },
+                    { CDF3( 7896, 18691, 25319) },
+                    { CDF3( 4670, 12882, 18881) },
+                },
+            }, {
+                {
+                    { CDF3( 5487, 10460, 13708) },
+                    { CDF3(21597, 28303, 30674) },
+                    { CDF3(11037, 21953, 26476) },
+                    { CDF3( 8147, 17962, 22952) },
+                    { CDF3( 5242, 13061, 18532) },
+                    { CDF3( 1889,  5208,  8182) },
+                    { CDF3(26774, 32133, 32590) },
+                    { CDF3(17844, 29564, 31767) },
+                    { CDF3(11690, 24438, 29171) },
+                    { CDF3( 7542, 18215, 24459) },
+                    { CDF3( 2993,  8050, 12319) },
+                    { CDF3(28023, 32328, 32591) },
+                    { CDF3(18651, 30126, 31954) },
+                    { CDF3(12164, 25146, 29589) },
+                    { CDF3( 7762, 18530, 24771) },
+                    { CDF3( 3492,  9183, 13920) },
+                    { CDF3(27591, 32008, 32491) },
+                    { CDF3(17149, 28853, 31510) },
+                    { CDF3(11485, 24003, 28860) },
+                    { CDF3( 7697, 18086, 24210) },
+                    { CDF3( 3075,  7999, 12218) },
+                    { CDF3(28268, 32482, 32654) },
+                    { CDF3(19631, 31051, 32404) },
+                    { CDF3(13860, 27260, 31020) },
+                    { CDF3( 9605, 21613, 27594) },
+                    { CDF3( 4876, 12162, 17908) },
+                    { CDF3(27248, 32316, 32576) },
+                    { CDF3(18955, 30457, 32075) },
+                    { CDF3(11824, 23997, 28795) },
+                    { CDF3( 7346, 18196, 24647) },
+                    { CDF3( 3403,  9247, 14111) },
+                    { CDF3(29711, 32655, 32735) },
+                    { CDF3(21169, 31394, 32417) },
+                    { CDF3(13487, 27198, 30957) },
+                    { CDF3( 8828, 21683, 27614) },
+                    { CDF3( 4270, 11451, 17038) },
+                    { CDF3(28708, 32578, 32731) },
+                    { CDF3(20120, 31241, 32482) },
+                    { CDF3(13692, 27550, 31321) },
+                    { CDF3( 9418, 22514, 28439) },
+                    { CDF3( 4999, 13283, 19462) },
+                }, {
+                    { CDF3( 5673, 14302, 19711) },
+                    { CDF3(26251, 30701, 31834) },
+                    { CDF3(12782, 23783, 27803) },
+                    { CDF3( 9127, 20657, 25808) },
+                    { CDF3( 6368, 16208, 21462) },
+                    { CDF3( 2465,  7177, 10822) },
+                    { CDF3(29961, 32563, 32719) },
+                    { CDF3(18318, 29891, 31949) },
+                    { CDF3(11361, 24514, 29357) },
+                    { CDF3( 7900, 19603, 25607) },
+                    { CDF3( 4002, 10590, 15546) },
+                    { CDF3(29637, 32310, 32595) },
+                    { CDF3(18296, 29913, 31809) },
+                    { CDF3(10144, 21515, 26871) },
+                    { CDF3( 5358, 14322, 20394) },
+                    { CDF3( 3067,  8362, 13346) },
+                    { CDF3(28652, 32470, 32676) },
+                    { CDF3(17538, 30771, 32209) },
+                    { CDF3(13924, 26882, 30494) },
+                    { CDF3(10496, 22837, 27869) },
+                    { CDF3( 7236, 16396, 21621) },
+                    { CDF3(30743, 32687, 32746) },
+                    { CDF3(23006, 31676, 32489) },
+                    { CDF3(14494, 27828, 31120) },
+                    { CDF3(10174, 22801, 28352) },
+                    { CDF3( 6242, 15281, 21043) },
+                    { CDF3(25817, 32243, 32720) },
+                    { CDF3(18618, 31367, 32325) },
+                    { CDF3(13997, 28318, 31878) },
+                    { CDF3(12255, 26534, 31383) },
+                    { CDF3( 9561, 21588, 28450) },
+                    { CDF3(28188, 32635, 32724) },
+                    { CDF3(22060, 32365, 32728) },
+                    { CDF3(18102, 30690, 32528) },
+                    { CDF3(14196, 28864, 31999) },
+                    { CDF3(12262, 25792, 30865) },
+                    { CDF3(24176, 32109, 32628) },
+                    { CDF3(18280, 29681, 31963) },
+                    { CDF3(10205, 23703, 29664) },
+                    { CDF3( 7889, 20025, 27676) },
+                    { CDF3( 6060, 16743, 23970) },
+                },
+            }, {
+                {
+                    { CDF3( 5141,  7096,  8260) },
+                    { CDF3(27186, 29022, 29789) },
+                    { CDF3( 6668, 12568, 15682) },
+                    { CDF3( 2172,  6181,  8638) },
+                    { CDF3( 1126,  3379,  4531) },
+                    { CDF3(  443,  1361,  2254) },
+                    { CDF3(26083, 31153, 32436) },
+                    { CDF3(13486, 24603, 28483) },
+                    { CDF3( 6508, 14840, 19910) },
+                    { CDF3( 3386,  8800, 13286) },
+                    { CDF3( 1530,  4322,  7054) },
+                    { CDF3(29639, 32080, 32548) },
+                    { CDF3(15897, 27552, 30290) },
+                    { CDF3( 8588, 20047, 25383) },
+                    { CDF3( 4889, 13339, 19269) },
+                    { CDF3( 2240,  6871, 10498) },
+                    { CDF3(28165, 32197, 32517) },
+                    { CDF3(20735, 30427, 31568) },
+                    { CDF3(14325, 24671, 27692) },
+                    { CDF3( 5119, 12554, 17805) },
+                    { CDF3( 1810,  5441,  8261) },
+                    { CDF3(31212, 32724, 32748) },
+                    { CDF3(23352, 31766, 32545) },
+                    { CDF3(14669, 27570, 31059) },
+                    { CDF3( 8492, 20894, 27272) },
+                    { CDF3( 3644, 10194, 15204) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 2461,  7013,  9371) },
+                    { CDF3(24749, 29600, 30986) },
+                    { CDF3( 9466, 19037, 22417) },
+                    { CDF3( 3584,  9280, 14400) },
+                    { CDF3( 1505,  3929,  5433) },
+                    { CDF3(  677,  1500,  2736) },
+                    { CDF3(23987, 30702, 32117) },
+                    { CDF3(13554, 24571, 29263) },
+                    { CDF3( 6211, 14556, 21155) },
+                    { CDF3( 3135, 10972, 15625) },
+                    { CDF3( 2435,  7127, 11427) },
+                    { CDF3(31300, 32532, 32550) },
+                    { CDF3(14757, 30365, 31954) },
+                    { CDF3( 4405, 11612, 18553) },
+                    { CDF3(  580,  4132,  7322) },
+                    { CDF3( 1695, 10169, 14124) },
+                    { CDF3(30008, 32282, 32591) },
+                    { CDF3(19244, 30108, 31748) },
+                    { CDF3(11180, 24158, 29555) },
+                    { CDF3( 5650, 14972, 19209) },
+                    { CDF3( 2114,  5109,  8456) },
+                    { CDF3(31856, 32716, 32748) },
+                    { CDF3(23012, 31664, 32572) },
+                    { CDF3(13694, 26656, 30636) },
+                    { CDF3( 8142, 19508, 26093) },
+                    { CDF3( 4253, 10955, 16724) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3(  601,   983,  1311) },
+                    { CDF3(18725, 23406, 28087) },
+                    { CDF3( 5461,  8192, 10923) },
+                    { CDF3( 3781, 15124, 21425) },
+                    { CDF3( 2587,  7761, 12072) },
+                    { CDF3(  106,   458,   810) },
+                    { CDF3(22282, 29710, 31894) },
+                    { CDF3( 8508, 20926, 25984) },
+                    { CDF3( 3726, 12713, 18083) },
+                    { CDF3( 1620,  7112, 10893) },
+                    { CDF3(  729,  2236,  3495) },
+                    { CDF3(30163, 32474, 32684) },
+                    { CDF3(18304, 30464, 32000) },
+                    { CDF3(11443, 26526, 29647) },
+                    { CDF3( 6007, 15292, 21299) },
+                    { CDF3( 2234,  6703,  8937) },
+                    { CDF3(30954, 32177, 32571) },
+                    { CDF3(17363, 29562, 31076) },
+                    { CDF3( 9686, 22464, 27410) },
+                    { CDF3( 8192, 16384, 21390) },
+                    { CDF3( 1755,  8046, 11264) },
+                    { CDF3(31168, 32734, 32748) },
+                    { CDF3(22486, 31441, 32471) },
+                    { CDF3(12833, 25627, 29738) },
+                    { CDF3( 6980, 17379, 23122) },
+                    { CDF3( 3111,  8887, 13479) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            },
+        }, .dc_sign = {
+            { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+            { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+        }, .br_tok = {
+            {
+                {
+                    { CDF3(14298, 20718, 24174) },
+                    { CDF3(12536, 19601, 23789) },
+                    { CDF3( 8712, 15051, 19503) },
+                    { CDF3( 6170, 11327, 15434) },
+                    { CDF3( 4742,  8926, 12538) },
+                    { CDF3( 3803,  7317, 10546) },
+                    { CDF3( 1696,  3317,  4871) },
+                    { CDF3(14392, 19951, 22756) },
+                    { CDF3(15978, 23218, 26818) },
+                    { CDF3(12187, 19474, 23889) },
+                    { CDF3( 9176, 15640, 20259) },
+                    { CDF3( 7068, 12655, 17028) },
+                    { CDF3( 5656, 10442, 14472) },
+                    { CDF3( 2580,  4992,  7244) },
+                    { CDF3(12136, 18049, 21426) },
+                    { CDF3(13784, 20721, 24481) },
+                    { CDF3(10836, 17621, 21900) },
+                    { CDF3( 8372, 14444, 18847) },
+                    { CDF3( 6523, 11779, 16000) },
+                    { CDF3( 5337,  9898, 13760) },
+                    { CDF3( 3034,  5860,  8462) },
+                }, {
+                    { CDF3(15967, 22905, 26286) },
+                    { CDF3(13534, 20654, 24579) },
+                    { CDF3( 9504, 16092, 20535) },
+                    { CDF3( 6975, 12568, 16903) },
+                    { CDF3( 5364, 10091, 14020) },
+                    { CDF3( 4357,  8370, 11857) },
+                    { CDF3( 2506,  4934,  7218) },
+                    { CDF3(23032, 28815, 30936) },
+                    { CDF3(19540, 26704, 29719) },
+                    { CDF3(15158, 22969, 27097) },
+                    { CDF3(11408, 18865, 23650) },
+                    { CDF3( 8885, 15448, 20250) },
+                    { CDF3( 7108, 12853, 17416) },
+                    { CDF3( 4231,  8041, 11480) },
+                    { CDF3(19823, 26490, 29156) },
+                    { CDF3(18890, 25929, 28932) },
+                    { CDF3(15660, 23491, 27433) },
+                    { CDF3(12147, 19776, 24488) },
+                    { CDF3( 9728, 16774, 21649) },
+                    { CDF3( 7919, 14277, 19066) },
+                    { CDF3( 5440, 10170, 14185) },
+                },
+            }, {
+                {
+                    { CDF3(14406, 20862, 24414) },
+                    { CDF3(11824, 18907, 23109) },
+                    { CDF3( 8257, 14393, 18803) },
+                    { CDF3( 5860, 10747, 14778) },
+                    { CDF3( 4475,  8486, 11984) },
+                    { CDF3( 3606,  6954, 10043) },
+                    { CDF3( 1736,  3410,  5048) },
+                    { CDF3(14430, 20046, 22882) },
+                    { CDF3(15593, 22899, 26709) },
+                    { CDF3(12102, 19368, 23811) },
+                    { CDF3( 9059, 15584, 20262) },
+                    { CDF3( 6999, 12603, 17048) },
+                    { CDF3( 5684, 10497, 14553) },
+                    { CDF3( 2822,  5438,  7862) },
+                    { CDF3(15785, 21585, 24359) },
+                    { CDF3(18347, 25229, 28266) },
+                    { CDF3(14974, 22487, 26389) },
+                    { CDF3(11423, 18681, 23271) },
+                    { CDF3( 8863, 15350, 20008) },
+                    { CDF3( 7153, 12852, 17278) },
+                    { CDF3( 3707,  7036,  9982) },
+                }, {
+                    { CDF3(15460, 21696, 25469) },
+                    { CDF3(12170, 19249, 23191) },
+                    { CDF3( 8723, 15027, 19332) },
+                    { CDF3( 6428, 11704, 15874) },
+                    { CDF3( 4922,  9292, 13052) },
+                    { CDF3( 4139,  7695, 11010) },
+                    { CDF3( 2291,  4508,  6598) },
+                    { CDF3(19856, 26920, 29828) },
+                    { CDF3(17923, 25289, 28792) },
+                    { CDF3(14278, 21968, 26297) },
+                    { CDF3(10910, 18136, 22950) },
+                    { CDF3( 8423, 14815, 19627) },
+                    { CDF3( 6771, 12283, 16774) },
+                    { CDF3( 4074,  7750, 11081) },
+                    { CDF3(19852, 26074, 28672) },
+                    { CDF3(19371, 26110, 28989) },
+                    { CDF3(16265, 23873, 27663) },
+                    { CDF3(12758, 20378, 24952) },
+                    { CDF3(10095, 17098, 21961) },
+                    { CDF3( 8250, 14628, 19451) },
+                    { CDF3( 5205,  9745, 13622) },
+                },
+            }, {
+                {
+                    { CDF3(10563, 16233, 19763) },
+                    { CDF3( 9794, 16022, 19804) },
+                    { CDF3( 6750, 11945, 15759) },
+                    { CDF3( 4963,  9186, 12752) },
+                    { CDF3( 3845,  7435, 10627) },
+                    { CDF3( 3051,  6085,  8834) },
+                    { CDF3( 1311,  2596,  3830) },
+                    { CDF3(11246, 16404, 19689) },
+                    { CDF3(12315, 18911, 22731) },
+                    { CDF3(10557, 17095, 21289) },
+                    { CDF3( 8136, 14006, 18249) },
+                    { CDF3( 6348, 11474, 15565) },
+                    { CDF3( 5196,  9655, 13400) },
+                    { CDF3( 2349,  4526,  6587) },
+                    { CDF3(13337, 18730, 21569) },
+                    { CDF3(19306, 26071, 28882) },
+                    { CDF3(15952, 23540, 27254) },
+                    { CDF3(12409, 19934, 24430) },
+                    { CDF3( 9760, 16706, 21389) },
+                    { CDF3( 8004, 14220, 18818) },
+                    { CDF3( 4138,  7794, 10961) },
+                }, {
+                    { CDF3(10870, 16684, 20949) },
+                    { CDF3( 9664, 15230, 18680) },
+                    { CDF3( 6886, 12109, 15408) },
+                    { CDF3( 4825,  8900, 12305) },
+                    { CDF3( 3630,  7162, 10314) },
+                    { CDF3( 3036,  6429,  9387) },
+                    { CDF3( 1671,  3296,  4940) },
+                    { CDF3(13819, 19159, 23026) },
+                    { CDF3(11984, 19108, 23120) },
+                    { CDF3(10690, 17210, 21663) },
+                    { CDF3( 7984, 14154, 18333) },
+                    { CDF3( 6868, 12294, 16124) },
+                    { CDF3( 5274,  8994, 12868) },
+                    { CDF3( 2988,  5771,  8424) },
+                    { CDF3(19736, 26647, 29141) },
+                    { CDF3(18933, 26070, 28984) },
+                    { CDF3(15779, 23048, 27200) },
+                    { CDF3(12638, 20061, 24532) },
+                    { CDF3(10692, 17545, 22220) },
+                    { CDF3( 9217, 15251, 20054) },
+                    { CDF3( 5078,  9284, 12594) },
+                },
+            }, {
+                {
+                    { CDF3( 2331,  3662,  5244) },
+                    { CDF3( 2891,  4771,  6145) },
+                    { CDF3( 4598,  7623,  9729) },
+                    { CDF3( 3520,  6845,  9199) },
+                    { CDF3( 3417,  6119,  9324) },
+                    { CDF3( 2601,  5412,  7385) },
+                    { CDF3(  600,  1173,  1744) },
+                    { CDF3( 7672, 13286, 17469) },
+                    { CDF3( 4232,  7792, 10793) },
+                    { CDF3( 2915,  5317,  7397) },
+                    { CDF3( 2318,  4356,  6152) },
+                    { CDF3( 2127,  4000,  5554) },
+                    { CDF3( 1850,  3478,  5275) },
+                    { CDF3(  977,  1933,  2843) },
+                    { CDF3(18280, 24387, 27989) },
+                    { CDF3(15852, 22671, 26185) },
+                    { CDF3(13845, 20951, 24789) },
+                    { CDF3(11055, 17966, 22129) },
+                    { CDF3( 9138, 15422, 19801) },
+                    { CDF3( 7454, 13145, 17456) },
+                    { CDF3( 3370,  6393,  9013) },
+                }, {
+                    { CDF3( 5842,  9229, 10838) },
+                    { CDF3( 2313,  3491,  4276) },
+                    { CDF3( 2998,  6104,  7496) },
+                    { CDF3( 2420,  7447,  9868) },
+                    { CDF3( 3034,  8495, 10923) },
+                    { CDF3( 4076,  8937, 10975) },
+                    { CDF3( 1086,  2370,  3299) },
+                    { CDF3( 9714, 17254, 20444) },
+                    { CDF3( 8543, 13698, 17123) },
+                    { CDF3( 4918,  9007, 11910) },
+                    { CDF3( 4129,  7532, 10553) },
+                    { CDF3( 2364,  5533,  8058) },
+                    { CDF3( 1834,  3546,  5563) },
+                    { CDF3( 1473,  2908,  4133) },
+                    { CDF3(15405, 21193, 25619) },
+                    { CDF3(15691, 21952, 26561) },
+                    { CDF3(12962, 19194, 24165) },
+                    { CDF3(10272, 17855, 22129) },
+                    { CDF3( 8588, 15270, 20718) },
+                    { CDF3( 8682, 14669, 19500) },
+                    { CDF3( 4870,  9636, 13205) },
+                },
+            },
+        },
+    }, [1] = {
+        .skip = {
+            {
+                { CDF1(30371) }, { CDF1( 7570) }, { CDF1(13155) },
+                { CDF1(20751) }, { CDF1(20969) }, { CDF1(27067) },
+                { CDF1(32013) }, { CDF1( 5495) }, { CDF1(17942) },
+                { CDF1(28280) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(31782) }, { CDF1( 1836) }, { CDF1(10689) },
+                { CDF1(17604) }, { CDF1(21622) }, { CDF1(27518) },
+                { CDF1(32399) }, { CDF1( 4419) }, { CDF1(16294) },
+                { CDF1(28345) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(31901) }, { CDF1(10311) }, { CDF1(18047) },
+                { CDF1(24806) }, { CDF1(23288) }, { CDF1(27914) },
+                { CDF1(32296) }, { CDF1( 4215) }, { CDF1(15756) },
+                { CDF1(28341) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(26726) }, { CDF1( 1045) }, { CDF1(11703) },
+                { CDF1(20590) }, { CDF1(18554) }, { CDF1(25970) },
+                { CDF1(31938) }, { CDF1( 5583) }, { CDF1(21313) },
+                { CDF1(29390) }, { CDF1(  641) }, { CDF1(22265) },
+                { CDF1(31452) },
+            }, {
+                { CDF1(26584) }, { CDF1(  188) }, { CDF1( 8847) },
+                { CDF1(24519) }, { CDF1(22938) }, { CDF1(30583) },
+                { CDF1(32608) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            },
+        }, .eob_bin_16 = {
+            {
+                { CDF4( 2125,  2551,  5165,  8946) },
+                { CDF4(  513,   765,  1859,  6339) },
+            }, {
+                { CDF4( 7637,  9498, 14259, 19108) },
+                { CDF4( 2497,  4096,  8866, 16993) },
+            },
+        }, .eob_bin_32 = {
+            {
+                { CDF5(  989,  1249,  2019,  4151, 10785) },
+                { CDF5(  313,   441,  1099,  2917,  8562) },
+            }, {
+                { CDF5( 8394, 10352, 13932, 18855, 26014) },
+                { CDF5( 2578,  4124,  8181, 13670, 24234) },
+            },
+        }, .eob_bin_64 = {
+            {
+                { CDF6( 1260,  1446,  2253,  3712,  6652, 13369) },
+                { CDF6(  401,   605,  1029,  2563,  5845, 12626) },
+            }, {
+                { CDF6( 8609, 10612, 14624, 18714, 22614, 29024) },
+                { CDF6( 1923,  3127,  5867,  9703, 14277, 27100) },
+            },
+        }, .eob_bin_128 = {
+            {
+                { CDF7(  685,   933,  1488,  2714,  4766,  8562, 19254) },
+                { CDF7(  217,   352,   618,  2303,  5261,  9969, 17472) },
+            }, {
+                { CDF7( 8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+                { CDF7( 2310,  4160,  7471, 14997, 17931, 20768, 30240) },
+            },
+        }, .eob_bin_256 = {
+            {
+                { CDF8( 1448,  2109,  4151,  6263,
+                        9329, 13260, 17944, 23300) },
+                { CDF8(  399,  1019,  1749,  3038,
+                       10444, 15546, 22739, 27294) },
+            }, {
+                { CDF8( 6402,  8148, 12623, 15072,
+                       18728, 22847, 26447, 29377) },
+                { CDF8( 1674,  3252,  5734, 10159,
+                       22397, 23802, 24821, 30940) },
+            },
+        }, .eob_bin_512 = {
+            { CDF9( 1230,  2278,  5035,  7776, 11871,
+                   15346, 19590, 24584, 28749) },
+            { CDF9( 7265,  9979, 15819, 19250, 21780,
+                   23846, 26478, 28396, 31811) },
+        }, .eob_bin_1024 = {
+            { CDF10(  696,   948,  3145,  5702,  9706,
+                    13217, 17851, 21856, 25692, 28034) },
+            { CDF10( 2672,  3591,  9330, 17084, 22725,
+                    24284, 26527, 28027, 28377, 30876) },
+        }, .eob_hi_bit = {
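+            // eob_hi_bit: binary CDFs for the extra high bit of the EOB
+            // position, indexed by tx size, plane type and EOB class; the
+            // flat CDF1(16384) entries (the first two of each row, among
+            // others) belong to contexts that are never coded.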
+            {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(17471) },
+                    { CDF1(20223) }, { CDF1(11357) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20335) },
+                    { CDF1(21667) }, { CDF1(14818) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20430) },
+                    { CDF1(20662) }, { CDF1(15367) }, { CDF1(16970) },
+                    { CDF1(14657) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(22117) },
+                    { CDF1(22028) }, { CDF1(18650) }, { CDF1(16042) },
+                    { CDF1(15885) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(22409) },
+                    { CDF1(21012) }, { CDF1(15650) }, { CDF1(17395) },
+                    { CDF1(15469) }, { CDF1(20205) }, { CDF1(19511) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(24220) },
+                    { CDF1(22480) }, { CDF1(17737) }, { CDF1(18916) },
+                    { CDF1(19268) }, { CDF1(18412) }, { CDF1(18844) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(25991) },
+                    { CDF1(20314) }, { CDF1(17731) }, { CDF1(19678) },
+                    { CDF1(18649) }, { CDF1(17307) }, { CDF1(21798) },
+                    { CDF1(17549) }, { CDF1(15630) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(26585) },
+                    { CDF1(21469) }, { CDF1(20432) }, { CDF1(17735) },
+                    { CDF1(19280) }, { CDF1(15235) }, { CDF1(20297) },
+                    { CDF1(22471) }, { CDF1(28997) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(26605) },
+                    { CDF1(11304) }, { CDF1(16726) }, { CDF1(16560) },
+                    { CDF1(20866) }, { CDF1(23524) }, { CDF1(19878) },
+                    { CDF1(13469) }, { CDF1(23084) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            },
+        }, .eob_base_tok = {
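+            // eob_base_tok: 3-symbol CDFs for the coefficient base level at
+            // the EOB position, per tx size, plane type and coefficient
+            // context.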
+            {
+                {
+                    { CDF2(17560, 29888) }, { CDF2(29671, 31549) },
+                    { CDF2(31007, 32056) }, { CDF2(27286, 30006) },
+                }, {
+                    { CDF2(26594, 31212) }, { CDF2(31208, 32582) },
+                    { CDF2(31835, 32637) }, { CDF2(30595, 32206) },
+                },
+            }, {
+                {
+                    { CDF2(15239, 29932) }, { CDF2(31315, 32095) },
+                    { CDF2(32130, 32434) }, { CDF2(30864, 31996) },
+                }, {
+                    { CDF2(26279, 30968) }, { CDF2(31142, 32495) },
+                    { CDF2(31713, 32540) }, { CDF2(31929, 32594) },
+                },
+            }, {
+                {
+                    { CDF2( 2644, 25198) }, { CDF2(32038, 32451) },
+                    { CDF2(32639, 32695) }, { CDF2(32166, 32518) },
+                }, {
+                    { CDF2(17187, 27668) }, { CDF2(31714, 32550) },
+                    { CDF2(32283, 32678) }, { CDF2(31930, 32563) },
+                },
+            }, {
+                {
+                    { CDF2( 1044,  2257) }, { CDF2(30755, 31923) },
+                    { CDF2(32208, 32693) }, { CDF2(32244, 32615) },
+                }, {
+                    { CDF2(21317, 26207) }, { CDF2(29133, 30868) },
+                    { CDF2(29311, 31231) }, { CDF2(29657, 31087) },
+                },
+            }, {
+                {
+                    { CDF2(  478,  1834) }, { CDF2(31005, 31987) },
+                    { CDF2(32317, 32724) }, { CDF2(30865, 32648) },
+                }, {
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                },
+            },
+        }, .base_tok = {
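+            // base_tok: 4-symbol CDFs for the coefficient base levels, per
+            // tx size and plane type, over 41 neighbour-derived contexts.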
+            {
+                {
+                    { CDF3( 6041, 11854, 15927) },
+                    { CDF3(20326, 30905, 32251) },
+                    { CDF3(14164, 26831, 30725) },
+                    { CDF3( 9760, 20647, 26585) },
+                    { CDF3( 6416, 14953, 21219) },
+                    { CDF3( 2966,  7151, 10891) },
+                    { CDF3(23567, 31374, 32254) },
+                    { CDF3(14978, 27416, 30946) },
+                    { CDF3( 9434, 20225, 26254) },
+                    { CDF3( 6658, 14558, 20535) },
+                    { CDF3( 3916,  8677, 12989) },
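+                    // The runs of CDF3( 8192, 16384, 24576) below (like
+                    // CDF1(16384) and CDF2(10923, 21845) elsewhere) are
+                    // equiprobable placeholders for context slots this
+                    // tx size never uses.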
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(18088, 29545, 31587) },
+                    { CDF3(13062, 25843, 30073) },
+                    { CDF3( 8940, 16827, 22251) },
+                    { CDF3( 7654, 13220, 17973) },
+                    { CDF3( 5733, 10316, 14456) },
+                    { CDF3(22879, 31388, 32114) },
+                    { CDF3(15215, 27993, 30955) },
+                    { CDF3( 9397, 19445, 24978) },
+                    { CDF3( 3442,  9813, 15344) },
+                    { CDF3( 1368,  3936,  6532) },
+                    { CDF3(25494, 32033, 32406) },
+                    { CDF3(16772, 27963, 30718) },
+                    { CDF3( 9419, 18165, 23260) },
+                    { CDF3( 2677,  7501, 11797) },
+                    { CDF3( 1516,  4344,  7170) },
+                    { CDF3(26556, 31454, 32101) },
+                    { CDF3(17128, 27035, 30108) },
+                    { CDF3( 8324, 15344, 20249) },
+                    { CDF3( 1903,  5696,  9469) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 8455, 19003, 24368) },
+                    { CDF3(23563, 32021, 32604) },
+                    { CDF3(16237, 29446, 31935) },
+                    { CDF3(10724, 23999, 29358) },
+                    { CDF3( 6725, 17528, 24416) },
+                    { CDF3( 3927, 10927, 16825) },
+                    { CDF3(26313, 32288, 32634) },
+                    { CDF3(17430, 30095, 32095) },
+                    { CDF3(11116, 24606, 29679) },
+                    { CDF3( 7195, 18384, 25269) },
+                    { CDF3( 4726, 12852, 19315) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(22822, 31648, 32483) },
+                    { CDF3(16724, 29633, 31929) },
+                    { CDF3(10261, 23033, 28725) },
+                    { CDF3( 7029, 17840, 24528) },
+                    { CDF3( 4867, 13886, 21502) },
+                    { CDF3(25298, 31892, 32491) },
+                    { CDF3(17809, 29330, 31512) },
+                    { CDF3( 9668, 21329, 26579) },
+                    { CDF3( 4774, 12956, 18976) },
+                    { CDF3( 2322,  7030, 11540) },
+                    { CDF3(25472, 31920, 32543) },
+                    { CDF3(17957, 29387, 31632) },
+                    { CDF3( 9196, 20593, 26400) },
+                    { CDF3( 4680, 12705, 19202) },
+                    { CDF3( 2917,  8456, 13436) },
+                    { CDF3(26471, 32059, 32574) },
+                    { CDF3(18458, 29783, 31909) },
+                    { CDF3( 8400, 19464, 25956) },
+                    { CDF3( 3812, 10973, 17206) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 6779, 13743, 17678) },
+                    { CDF3(24806, 31797, 32457) },
+                    { CDF3(17616, 29047, 31372) },
+                    { CDF3(11063, 23175, 28003) },
+                    { CDF3( 6521, 16110, 22324) },
+                    { CDF3( 2764,  7504, 11654) },
+                    { CDF3(25266, 32367, 32637) },
+                    { CDF3(19054, 30553, 32175) },
+                    { CDF3(12139, 25212, 29807) },
+                    { CDF3( 7311, 18162, 24704) },
+                    { CDF3( 3397,  9164, 14074) },
+                    { CDF3(25988, 32208, 32522) },
+                    { CDF3(16253, 28912, 31526) },
+                    { CDF3( 9151, 21387, 27372) },
+                    { CDF3( 5688, 14915, 21496) },
+                    { CDF3( 2717,  7627, 12004) },
+                    { CDF3(23144, 31855, 32443) },
+                    { CDF3(16070, 28491, 31325) },
+                    { CDF3( 8702, 20467, 26517) },
+                    { CDF3( 5243, 13956, 20367) },
+                    { CDF3( 2621,  7335, 11567) },
+                    { CDF3(26636, 32340, 32630) },
+                    { CDF3(19990, 31050, 32341) },
+                    { CDF3(13243, 26105, 30315) },
+                    { CDF3( 8588, 19521, 25918) },
+                    { CDF3( 4717, 11585, 17304) },
+                    { CDF3(25844, 32292, 32582) },
+                    { CDF3(19090, 30635, 32097) },
+                    { CDF3(11963, 24546, 28939) },
+                    { CDF3( 6218, 16087, 22354) },
+                    { CDF3( 2340,  6608, 10426) },
+                    { CDF3(28046, 32576, 32694) },
+                    { CDF3(21178, 31313, 32296) },
+                    { CDF3(13486, 26184, 29870) },
+                    { CDF3( 7149, 17871, 23723) },
+                    { CDF3( 2833,  7958, 12259) },
+                    { CDF3(27710, 32528, 32686) },
+                    { CDF3(20674, 31076, 32268) },
+                    { CDF3(12413, 24955, 29243) },
+                    { CDF3( 6676, 16927, 23097) },
+                    { CDF3( 2966,  8333, 12919) },
+                }, {
+                    { CDF3( 8639, 19339, 24429) },
+                    { CDF3(24404, 31837, 32525) },
+                    { CDF3(16997, 29425, 31784) },
+                    { CDF3(11253, 24234, 29149) },
+                    { CDF3( 6751, 17394, 24028) },
+                    { CDF3( 3490,  9830, 15191) },
+                    { CDF3(26283, 32471, 32714) },
+                    { CDF3(19599, 31168, 32442) },
+                    { CDF3(13146, 26954, 30893) },
+                    { CDF3( 8214, 20588, 26890) },
+                    { CDF3( 4699, 13081, 19300) },
+                    { CDF3(28212, 32458, 32669) },
+                    { CDF3(18594, 30316, 32100) },
+                    { CDF3(11219, 24408, 29234) },
+                    { CDF3( 6865, 17656, 24149) },
+                    { CDF3( 3678, 10362, 16006) },
+                    { CDF3(25825, 32136, 32616) },
+                    { CDF3(17313, 29853, 32021) },
+                    { CDF3(11197, 24471, 29472) },
+                    { CDF3( 6947, 17781, 24405) },
+                    { CDF3( 3768, 10660, 16261) },
+                    { CDF3(27352, 32500, 32706) },
+                    { CDF3(20850, 31468, 32469) },
+                    { CDF3(14021, 27707, 31133) },
+                    { CDF3( 8964, 21748, 27838) },
+                    { CDF3( 5437, 14665, 21187) },
+                    { CDF3(26304, 32492, 32698) },
+                    { CDF3(20409, 31380, 32385) },
+                    { CDF3(13682, 27222, 30632) },
+                    { CDF3( 8974, 21236, 26685) },
+                    { CDF3( 4234, 11665, 16934) },
+                    { CDF3(26273, 32357, 32711) },
+                    { CDF3(20672, 31242, 32441) },
+                    { CDF3(14172, 27254, 30902) },
+                    { CDF3( 9870, 21898, 27275) },
+                    { CDF3( 5164, 13506, 19270) },
+                    { CDF3(26725, 32459, 32728) },
+                    { CDF3(20991, 31442, 32527) },
+                    { CDF3(13071, 26434, 30811) },
+                    { CDF3( 8184, 20090, 26742) },
+                    { CDF3( 4803, 13255, 19895) },
+                },
+            }, {
+                {
+                    { CDF3( 7555, 14942, 18501) },
+                    { CDF3(24410, 31178, 32287) },
+                    { CDF3(14394, 26738, 30253) },
+                    { CDF3( 8413, 19554, 25195) },
+                    { CDF3( 4766, 12924, 18785) },
+                    { CDF3( 2029,  5806,  9207) },
+                    { CDF3(26776, 32364, 32663) },
+                    { CDF3(18732, 29967, 31931) },
+                    { CDF3(11005, 23786, 28852) },
+                    { CDF3( 6466, 16909, 23510) },
+                    { CDF3( 3044,  8638, 13419) },
+                    { CDF3(29208, 32582, 32704) },
+                    { CDF3(20068, 30857, 32208) },
+                    { CDF3(12003, 25085, 29595) },
+                    { CDF3( 6947, 17750, 24189) },
+                    { CDF3( 3245,  9103, 14007) },
+                    { CDF3(27359, 32465, 32669) },
+                    { CDF3(19421, 30614, 32174) },
+                    { CDF3(11915, 25010, 29579) },
+                    { CDF3( 6950, 17676, 24074) },
+                    { CDF3( 3007,  8473, 13096) },
+                    { CDF3(29002, 32676, 32735) },
+                    { CDF3(22102, 31849, 32576) },
+                    { CDF3(14408, 28009, 31405) },
+                    { CDF3( 9027, 21679, 27931) },
+                    { CDF3( 4694, 12678, 18748) },
+                    { CDF3(28216, 32528, 32682) },
+                    { CDF3(20849, 31264, 32318) },
+                    { CDF3(12756, 25815, 29751) },
+                    { CDF3( 7565, 18801, 24923) },
+                    { CDF3( 3509,  9533, 14477) },
+                    { CDF3(30133, 32687, 32739) },
+                    { CDF3(23063, 31910, 32515) },
+                    { CDF3(14588, 28051, 31132) },
+                    { CDF3( 9085, 21649, 27457) },
+                    { CDF3( 4261, 11654, 17264) },
+                    { CDF3(29518, 32691, 32748) },
+                    { CDF3(22451, 31959, 32613) },
+                    { CDF3(14864, 28722, 31700) },
+                    { CDF3( 9695, 22964, 28716) },
+                    { CDF3( 4932, 13358, 19502) },
+                }, {
+                    { CDF3( 6465, 16958, 21688) },
+                    { CDF3(25199, 31514, 32360) },
+                    { CDF3(14774, 27149, 30607) },
+                    { CDF3( 9257, 21438, 26972) },
+                    { CDF3( 5723, 15183, 21882) },
+                    { CDF3( 3150,  8879, 13731) },
+                    { CDF3(26989, 32262, 32682) },
+                    { CDF3(17396, 29937, 32085) },
+                    { CDF3(11387, 24901, 29784) },
+                    { CDF3( 7289, 18821, 25548) },
+                    { CDF3( 3734, 10577, 16086) },
+                    { CDF3(29728, 32501, 32695) },
+                    { CDF3(17431, 29701, 31903) },
+                    { CDF3( 9921, 22826, 28300) },
+                    { CDF3( 5896, 15434, 22068) },
+                    { CDF3( 3430,  9646, 14757) },
+                    { CDF3(28614, 32511, 32705) },
+                    { CDF3(19364, 30638, 32263) },
+                    { CDF3(13129, 26254, 30402) },
+                    { CDF3( 8754, 20484, 26440) },
+                    { CDF3( 4378, 11607, 17110) },
+                    { CDF3(30292, 32671, 32744) },
+                    { CDF3(21780, 31603, 32501) },
+                    { CDF3(14314, 27829, 31291) },
+                    { CDF3( 9611, 22327, 28263) },
+                    { CDF3( 4890, 13087, 19065) },
+                    { CDF3(25862, 32567, 32733) },
+                    { CDF3(20794, 32050, 32567) },
+                    { CDF3(17243, 30625, 32254) },
+                    { CDF3(13283, 27628, 31474) },
+                    { CDF3( 9669, 22532, 28918) },
+                    { CDF3(27435, 32697, 32748) },
+                    { CDF3(24922, 32390, 32714) },
+                    { CDF3(21449, 31504, 32536) },
+                    { CDF3(16392, 29729, 31832) },
+                    { CDF3(11692, 24884, 29076) },
+                    { CDF3(24193, 32290, 32735) },
+                    { CDF3(18909, 31104, 32563) },
+                    { CDF3(12236, 26841, 31403) },
+                    { CDF3( 8171, 21840, 29082) },
+                    { CDF3( 7224, 17280, 25275) },
+                },
+            }, {
+                {
+                    { CDF3( 3078,  6839,  9890) },
+                    { CDF3(13837, 20450, 24479) },
+                    { CDF3( 5914, 14222, 19328) },
+                    { CDF3( 3866, 10267, 14762) },
+                    { CDF3( 2612,  7208, 11042) },
+                    { CDF3( 1067,  2991,  4776) },
+                    { CDF3(25817, 31646, 32529) },
+                    { CDF3(13708, 26338, 30385) },
+                    { CDF3( 7328, 18585, 24870) },
+                    { CDF3( 4691, 13080, 19276) },
+                    { CDF3( 1825,  5253,  8352) },
+                    { CDF3(29386, 32315, 32624) },
+                    { CDF3(17160, 29001, 31360) },
+                    { CDF3( 9602, 21862, 27396) },
+                    { CDF3( 5915, 15772, 22148) },
+                    { CDF3( 2786,  7779, 12047) },
+                    { CDF3(29246, 32450, 32663) },
+                    { CDF3(18696, 29929, 31818) },
+                    { CDF3(10510, 23369, 28560) },
+                    { CDF3( 6229, 16499, 23125) },
+                    { CDF3( 2608,  7448, 11705) },
+                    { CDF3(30753, 32710, 32748) },
+                    { CDF3(21638, 31487, 32503) },
+                    { CDF3(12937, 26854, 30870) },
+                    { CDF3( 8182, 20596, 26970) },
+                    { CDF3( 3637, 10269, 15497) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 5244, 12150, 16906) },
+                    { CDF3(20486, 26858, 29701) },
+                    { CDF3( 7756, 18317, 23735) },
+                    { CDF3( 3452,  9256, 13146) },
+                    { CDF3( 2020,  5206,  8229) },
+                    { CDF3( 1801,  4993,  7903) },
+                    { CDF3(27051, 31858, 32531) },
+                    { CDF3(15988, 27531, 30619) },
+                    { CDF3( 9188, 21484, 26719) },
+                    { CDF3( 6273, 17186, 23800) },
+                    { CDF3( 3108,  9355, 14764) },
+                    { CDF3(31076, 32520, 32680) },
+                    { CDF3(18119, 30037, 31850) },
+                    { CDF3(10244, 22969, 27472) },
+                    { CDF3( 4692, 14077, 19273) },
+                    { CDF3( 3694, 11677, 17556) },
+                    { CDF3(30060, 32581, 32720) },
+                    { CDF3(21011, 30775, 32120) },
+                    { CDF3(11931, 24820, 29289) },
+                    { CDF3( 7119, 17662, 24356) },
+                    { CDF3( 3833, 10706, 16304) },
+                    { CDF3(31954, 32731, 32748) },
+                    { CDF3(23913, 31724, 32489) },
+                    { CDF3(15520, 28060, 31286) },
+                    { CDF3(11517, 23008, 28571) },
+                    { CDF3( 6193, 14508, 20629) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 1035,  2807,  4156) },
+                    { CDF3(13162, 18138, 20939) },
+                    { CDF3( 2696,  6633,  8755) },
+                    { CDF3( 1373,  4161,  6853) },
+                    { CDF3( 1099,  2746,  4716) },
+                    { CDF3(  340,  1021,  1599) },
+                    { CDF3(22826, 30419, 32135) },
+                    { CDF3(10395, 21762, 26942) },
+                    { CDF3( 4726, 12407, 17361) },
+                    { CDF3( 2447,  7080, 10593) },
+                    { CDF3( 1227,  3717,  6011) },
+                    { CDF3(28156, 31424, 31934) },
+                    { CDF3(16915, 27754, 30373) },
+                    { CDF3( 9148, 20990, 26431) },
+                    { CDF3( 5950, 15515, 21148) },
+                    { CDF3( 2492,  7327, 11526) },
+                    { CDF3(30602, 32477, 32670) },
+                    { CDF3(20026, 29955, 31568) },
+                    { CDF3(11220, 23628, 28105) },
+                    { CDF3( 6652, 17019, 22973) },
+                    { CDF3( 3064,  8536, 13043) },
+                    { CDF3(31769, 32724, 32748) },
+                    { CDF3(22230, 30887, 32373) },
+                    { CDF3(12234, 25079, 29731) },
+                    { CDF3( 7326, 18816, 25353) },
+                    { CDF3( 3933, 10907, 16616) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            },
+        }, .dc_sign = {
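+            // dc_sign: binary CDFs for the sign of the DC coefficient, per
+            // plane type over 3 neighbour-sign contexts.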
+            { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+            { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+        }, .br_tok = {
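+            // br_tok: 4-symbol CDFs for the "base range" increments coding
+            // coefficient levels above 3, grouped into 4 tx-size classes,
+            // 2 plane types and 21 magnitude contexts.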
+            {
+                {
+                    { CDF3(14995, 21341, 24749) },
+                    { CDF3(13158, 20289, 24601) },
+                    { CDF3( 8941, 15326, 19876) },
+                    { CDF3( 6297, 11541, 15807) },
+                    { CDF3( 4817,  9029, 12776) },
+                    { CDF3( 3731,  7273, 10627) },
+                    { CDF3( 1847,  3617,  5354) },
+                    { CDF3(14472, 19659, 22343) },
+                    { CDF3(16806, 24162, 27533) },
+                    { CDF3(12900, 20404, 24713) },
+                    { CDF3( 9411, 16112, 20797) },
+                    { CDF3( 7056, 12697, 17148) },
+                    { CDF3( 5544, 10339, 14460) },
+                    { CDF3( 2954,  5704,  8319) },
+                    { CDF3(12464, 18071, 21354) },
+                    { CDF3(15482, 22528, 26034) },
+                    { CDF3(12070, 19269, 23624) },
+                    { CDF3( 8953, 15406, 20106) },
+                    { CDF3( 7027, 12730, 17220) },
+                    { CDF3( 5887, 10913, 15140) },
+                    { CDF3( 3793,  7278, 10447) },
+                }, {
+                    { CDF3(15571, 22232, 25749) },
+                    { CDF3(14506, 21575, 25374) },
+                    { CDF3(10189, 17089, 21569) },
+                    { CDF3( 7316, 13301, 17915) },
+                    { CDF3( 5783, 10912, 15190) },
+                    { CDF3( 4760,  9155, 13088) },
+                    { CDF3( 2993,  5966,  8774) },
+                    { CDF3(23424, 28903, 30778) },
+                    { CDF3(20775, 27666, 30290) },
+                    { CDF3(16474, 24410, 28299) },
+                    { CDF3(12471, 20180, 24987) },
+                    { CDF3( 9410, 16487, 21439) },
+                    { CDF3( 7536, 13614, 18529) },
+                    { CDF3( 5048,  9586, 13549) },
+                    { CDF3(21090, 27290, 29756) },
+                    { CDF3(20796, 27402, 30026) },
+                    { CDF3(17819, 25485, 28969) },
+                    { CDF3(13860, 21909, 26462) },
+                    { CDF3(11002, 18494, 23529) },
+                    { CDF3( 8953, 15929, 20897) },
+                    { CDF3( 6448, 11918, 16454) },
+                },
+            }, {
+                {
+                    { CDF3(15999, 22208, 25449) },
+                    { CDF3(13050, 19988, 24122) },
+                    { CDF3( 8594, 14864, 19378) },
+                    { CDF3( 6033, 11079, 15238) },
+                    { CDF3( 4554,  8683, 12347) },
+                    { CDF3( 3672,  7139, 10337) },
+                    { CDF3( 1900,  3771,  5576) },
+                    { CDF3(15788, 21340, 23949) },
+                    { CDF3(16825, 24235, 27758) },
+                    { CDF3(12873, 20402, 24810) },
+                    { CDF3( 9590, 16363, 21094) },
+                    { CDF3( 7352, 13209, 17733) },
+                    { CDF3( 5960, 10989, 15184) },
+                    { CDF3( 3232,  6234,  9007) },
+                    { CDF3(15761, 20716, 23224) },
+                    { CDF3(19318, 25989, 28759) },
+                    { CDF3(15529, 23094, 26929) },
+                    { CDF3(11662, 18989, 23641) },
+                    { CDF3( 8955, 15568, 20366) },
+                    { CDF3( 7281, 13106, 17708) },
+                    { CDF3( 4248,  8059, 11440) },
+                }, {
+                    { CDF3(14899, 21217, 24503) },
+                    { CDF3(13519, 20283, 24047) },
+                    { CDF3( 9429, 15966, 20365) },
+                    { CDF3( 6700, 12355, 16652) },
+                    { CDF3( 5088,  9704, 13716) },
+                    { CDF3( 4243,  8154, 11731) },
+                    { CDF3( 2702,  5364,  7861) },
+                    { CDF3(22745, 28388, 30454) },
+                    { CDF3(20235, 27146, 29922) },
+                    { CDF3(15896, 23715, 27637) },
+                    { CDF3(11840, 19350, 24131) },
+                    { CDF3( 9122, 15932, 20880) },
+                    { CDF3( 7488, 13581, 18362) },
+                    { CDF3( 5114,  9568, 13370) },
+                    { CDF3(20845, 26553, 28932) },
+                    { CDF3(20981, 27372, 29884) },
+                    { CDF3(17781, 25335, 28785) },
+                    { CDF3(13760, 21708, 26297) },
+                    { CDF3(10975, 18415, 23365) },
+                    { CDF3( 9045, 15789, 20686) },
+                    { CDF3( 6130, 11199, 15423) },
+                },
+            }, {
+                {
+                    { CDF3(13549, 19724, 23158) },
+                    { CDF3(11844, 18382, 22246) },
+                    { CDF3( 7919, 13619, 17773) },
+                    { CDF3( 5486, 10143, 13946) },
+                    { CDF3( 4166,  7983, 11324) },
+                    { CDF3( 3364,  6506,  9427) },
+                    { CDF3( 1598,  3160,  4674) },
+                    { CDF3(15281, 20979, 23781) },
+                    { CDF3(14939, 22119, 25952) },
+                    { CDF3(11363, 18407, 22812) },
+                    { CDF3( 8609, 14857, 19370) },
+                    { CDF3( 6737, 12184, 16480) },
+                    { CDF3( 5506, 10263, 14262) },
+                    { CDF3( 2990,  5786,  8380) },
+                    { CDF3(20249, 25253, 27417) },
+                    { CDF3(21070, 27518, 30001) },
+                    { CDF3(16854, 24469, 28074) },
+                    { CDF3(12864, 20486, 25000) },
+                    { CDF3( 9962, 16978, 21778) },
+                    { CDF3( 8074, 14338, 19048) },
+                    { CDF3( 4494,  8479, 11906) },
+                }, {
+                    { CDF3(13960, 19617, 22829) },
+                    { CDF3(11150, 17341, 21228) },
+                    { CDF3( 7150, 12964, 17190) },
+                    { CDF3( 5331, 10002, 13867) },
+                    { CDF3( 4167,  7744, 11057) },
+                    { CDF3( 3480,  6629,  9646) },
+                    { CDF3( 1883,  3784,  5686) },
+                    { CDF3(18752, 25660, 28912) },
+                    { CDF3(16968, 24586, 28030) },
+                    { CDF3(13520, 21055, 25313) },
+                    { CDF3(10453, 17626, 22280) },
+                    { CDF3( 8386, 14505, 19116) },
+                    { CDF3( 6742, 12595, 17008) },
+                    { CDF3( 4273,  8140, 11499) },
+                    { CDF3(22120, 27827, 30233) },
+                    { CDF3(20563, 27358, 29895) },
+                    { CDF3(17076, 24644, 28153) },
+                    { CDF3(13362, 20942, 25309) },
+                    { CDF3(10794, 17965, 22695) },
+                    { CDF3( 9014, 15652, 20319) },
+                    { CDF3( 5708, 10512, 14497) },
+                },
+            }, {
+                {
+                    { CDF3( 5705, 10930, 15725) },
+                    { CDF3( 7946, 12765, 16115) },
+                    { CDF3( 6801, 12123, 16226) },
+                    { CDF3( 5462, 10135, 14200) },
+                    { CDF3( 4189,  8011, 11507) },
+                    { CDF3( 3191,  6229,  9408) },
+                    { CDF3( 1057,  2137,  3212) },
+                    { CDF3(10018, 17067, 21491) },
+                    { CDF3( 7380, 12582, 16453) },
+                    { CDF3( 6068, 10845, 14339) },
+                    { CDF3( 5098,  9198, 12555) },
+                    { CDF3( 4312,  8010, 11119) },
+                    { CDF3( 3700,  6966,  9781) },
+                    { CDF3( 1693,  3326,  4887) },
+                    { CDF3(18757, 24930, 27774) },
+                    { CDF3(17648, 24596, 27817) },
+                    { CDF3(14707, 22052, 26026) },
+                    { CDF3(11720, 18852, 23292) },
+                    { CDF3( 9357, 15952, 20525) },
+                    { CDF3( 7810, 13753, 18210) },
+                    { CDF3( 3879,  7333, 10328) },
+                }, {
+                    { CDF3( 8278, 13242, 15922) },
+                    { CDF3(10547, 15867, 18919) },
+                    { CDF3( 9106, 15842, 20609) },
+                    { CDF3( 6833, 13007, 17218) },
+                    { CDF3( 4811,  9712, 13923) },
+                    { CDF3( 3985,  7352, 11128) },
+                    { CDF3( 1688,  3458,  5262) },
+                    { CDF3(12951, 21861, 26510) },
+                    { CDF3( 9788, 16044, 20276) },
+                    { CDF3( 6309, 11244, 14870) },
+                    { CDF3( 5183,  9349, 12566) },
+                    { CDF3( 4389,  8229, 11492) },
+                    { CDF3( 3633,  6945, 10620) },
+                    { CDF3( 3600,  6847,  9907) },
+                    { CDF3(21748, 28137, 30255) },
+                    { CDF3(19436, 26581, 29560) },
+                    { CDF3(16359, 24201, 27953) },
+                    { CDF3(13961, 21693, 25871) },
+                    { CDF3(11544, 18686, 23322) },
+                    { CDF3( 9372, 16462, 20952) },
+                    { CDF3( 6138, 11210, 15390) },
+                },
+            },
+        },
+    }, [2] = {
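+        // [2]: third of the four default CDF sets selected by the frame's
+        // base quantizer index; same layout as the entries above.
+        // skip: binary CDFs for the per-transform-block all-zero (skip)
+        // flag, 13 contexts for each of the 5 square tx sizes.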
+        .skip = {
+            {
+                { CDF1(29614) }, { CDF1( 9068) }, { CDF1(12924) },
+                { CDF1(19538) }, { CDF1(17737) }, { CDF1(24619) },
+                { CDF1(30642) }, { CDF1( 4119) }, { CDF1(16026) },
+                { CDF1(25657) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(31957) }, { CDF1( 3230) }, { CDF1(11153) },
+                { CDF1(18123) }, { CDF1(20143) }, { CDF1(26536) },
+                { CDF1(31986) }, { CDF1( 3050) }, { CDF1(14603) },
+                { CDF1(25155) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(32363) }, { CDF1(10692) }, { CDF1(19090) },
+                { CDF1(24357) }, { CDF1(24442) }, { CDF1(28312) },
+                { CDF1(32169) }, { CDF1( 3648) }, { CDF1(15690) },
+                { CDF1(26815) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(30669) }, { CDF1( 3832) }, { CDF1(11663) },
+                { CDF1(18889) }, { CDF1(19782) }, { CDF1(23313) },
+                { CDF1(31330) }, { CDF1( 5124) }, { CDF1(18719) },
+                { CDF1(28468) }, { CDF1( 3082) }, { CDF1(20982) },
+                { CDF1(29443) },
+            }, {
+                { CDF1(28573) }, { CDF1( 3183) }, { CDF1(17802) },
+                { CDF1(25977) }, { CDF1(26677) }, { CDF1(27832) },
+                { CDF1(32387) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            },
+        }, .eob_bin_16 = {
+            {
+                { CDF4( 4016,  4897,  8881, 14968) },
+                { CDF4(  716,  1105,  2646, 10056) },
+            }, {
+                { CDF4(11139, 13270, 18241, 23566) },
+                { CDF4( 3192,  5032, 10297, 19755) },
+            },
+        }, .eob_bin_32 = {
+            {
+                { CDF5( 2515,  3003,  4452,  8162, 16041) },
+                { CDF5(  574,   821,  1836,  5089, 13128) },
+            }, {
+                { CDF5(13468, 16303, 20361, 25105, 29281) },
+                { CDF5( 3542,  5502, 10415, 16760, 25644) },
+            },
+        }, .eob_bin_64 = {
+            {
+                { CDF6( 2374,  2772,  4583,  7276, 12288, 19706) },
+                { CDF6(  497,   810,  1315,  3000,  7004, 15641) },
+            }, {
+                { CDF6(15050, 17126, 21410, 24886, 28156, 30726) },
+                { CDF6( 4034,  6290, 10235, 14982, 21214, 28491) },
+            },
+        }, .eob_bin_128 = {
+            {
+                { CDF7( 1366,  1738,  2527,  5016,  9355, 15797, 24643) },
+                { CDF7(  354,   558,   944,  2760,  7287, 14037, 21779) },
+            }, {
+                { CDF7(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+                { CDF7( 6275,  9889, 14769, 23164, 27988, 30493, 32272) },
+            },
+        }, .eob_bin_256 = {
+            {
+                { CDF8( 3089,  3920,  6038,  9460,
+                       14266, 19881, 25766, 29176) },
+                { CDF8( 1084,  2358,  3488,  5122,
+                       11483, 18103, 26023, 29799) },
+            }, {
+                { CDF8(11514, 13794, 17480, 20754,
+                       24361, 27378, 29492, 31277) },
+                { CDF8( 6571,  9610, 15516, 21826,
+                       29092, 30829, 31842, 32708) },
+            },
+        }, .eob_bin_512 = {
+            { CDF9( 2624,  3936,  6480,  9686, 13979,
+                   17726, 23267, 28410, 31078) },
+            { CDF9(12015, 14769, 19588, 22052, 24222,
+                   25812, 27300, 29219, 32114) },
+        }, .eob_bin_1024 = {
+            { CDF10( 2784,  3831,  7041, 10521, 14847,
+                    18844, 23155, 26682, 29229, 31045) },
+            { CDF10( 9577, 12466, 17739, 20750, 22061,
+                    23215, 24601, 25483, 25843, 32056) },
+        }, .eob_hi_bit = {
+            {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(18983) },
+                    { CDF1(20512) }, { CDF1(14885) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20090) },
+                    { CDF1(19444) }, { CDF1(17286) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(19139) },
+                    { CDF1(21487) }, { CDF1(18959) }, { CDF1(20910) },
+                    { CDF1(19089) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20536) },
+                    { CDF1(20664) }, { CDF1(20625) }, { CDF1(19123) },
+                    { CDF1(14862) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(19833) },
+                    { CDF1(21502) }, { CDF1(17485) }, { CDF1(20267) },
+                    { CDF1(18353) }, { CDF1(23329) }, { CDF1(21478) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(22041) },
+                    { CDF1(23434) }, { CDF1(20001) }, { CDF1(20554) },
+                    { CDF1(20951) }, { CDF1(20145) }, { CDF1(15562) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(23312) },
+                    { CDF1(21607) }, { CDF1(16526) }, { CDF1(18957) },
+                    { CDF1(18034) }, { CDF1(18934) }, { CDF1(24247) },
+                    { CDF1(16921) }, { CDF1(17080) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(26579) },
+                    { CDF1(24910) }, { CDF1(18637) }, { CDF1(19800) },
+                    { CDF1(20388) }, { CDF1( 9887) }, { CDF1(15642) },
+                    { CDF1(30198) }, { CDF1(24721) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(26998) },
+                    { CDF1(16737) }, { CDF1(17838) }, { CDF1(18922) },
+                    { CDF1(19515) }, { CDF1(18636) }, { CDF1(17333) },
+                    { CDF1(15776) }, { CDF1(22658) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            },
+        }, .eob_base_tok = {
+            {
+                {
+                    { CDF2(20092, 30774) }, { CDF2(30695, 32020) },
+                    { CDF2(31131, 32103) }, { CDF2(28666, 30870) },
+                }, {
+                    { CDF2(27258, 31095) }, { CDF2(31804, 32623) },
+                    { CDF2(31763, 32528) }, { CDF2(31438, 32506) },
+                },
+            }, {
+                {
+                    { CDF2(18049, 30489) }, { CDF2(31706, 32286) },
+                    { CDF2(32163, 32473) }, { CDF2(31550, 32184) },
+                }, {
+                    { CDF2(27116, 30842) }, { CDF2(31971, 32598) },
+                    { CDF2(32088, 32576) }, { CDF2(32067, 32664) },
+                },
+            }, {
+                {
+                    { CDF2(12854, 29093) }, { CDF2(32272, 32558) },
+                    { CDF2(32667, 32729) }, { CDF2(32306, 32585) },
+                }, {
+                    { CDF2(25476, 30366) }, { CDF2(32169, 32687) },
+                    { CDF2(32479, 32689) }, { CDF2(31673, 32634) },
+                },
+            }, {
+                {
+                    { CDF2( 2809, 19301) }, { CDF2(32205, 32622) },
+                    { CDF2(32338, 32730) }, { CDF2(31786, 32616) },
+                }, {
+                    { CDF2(22737, 29105) }, { CDF2(30810, 32362) },
+                    { CDF2(30014, 32627) }, { CDF2(30528, 32574) },
+                },
+            }, {
+                {
+                    { CDF2(  935,  3382) }, { CDF2(30789, 31909) },
+                    { CDF2(32466, 32756) }, { CDF2(30860, 32513) },
+                }, {
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                },
+            },
+        }, .base_tok = {
+            {
+                {
+                    { CDF3( 8896, 16227, 20630) },
+                    { CDF3(23629, 31782, 32527) },
+                    { CDF3(15173, 27755, 31321) },
+                    { CDF3(10158, 21233, 27382) },
+                    { CDF3( 6420, 14857, 21558) },
+                    { CDF3( 3269,  8155, 12646) },
+                    { CDF3(24835, 32009, 32496) },
+                    { CDF3(16509, 28421, 31579) },
+                    { CDF3(10957, 21514, 27418) },
+                    { CDF3( 7881, 15930, 22096) },
+                    { CDF3( 5388, 10960, 15918) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(20745, 30773, 32093) },
+                    { CDF3(15200, 27221, 30861) },
+                    { CDF3(13032, 20873, 25667) },
+                    { CDF3(12285, 18663, 23494) },
+                    { CDF3(11563, 17481, 21489) },
+                    { CDF3(26260, 31982, 32320) },
+                    { CDF3(15397, 28083, 31100) },
+                    { CDF3( 9742, 19217, 24824) },
+                    { CDF3( 3261,  9629, 15362) },
+                    { CDF3( 1480,  4322,  7499) },
+                    { CDF3(27599, 32256, 32460) },
+                    { CDF3(16857, 27659, 30774) },
+                    { CDF3( 9551, 18290, 23748) },
+                    { CDF3( 3052,  8933, 14103) },
+                    { CDF3( 2021,  5910,  9787) },
+                    { CDF3(29005, 32015, 32392) },
+                    { CDF3(17677, 27694, 30863) },
+                    { CDF3( 9204, 17356, 23219) },
+                    { CDF3( 2403,  7516, 12814) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3(10808, 22056, 26896) },
+                    { CDF3(25739, 32313, 32676) },
+                    { CDF3(17288, 30203, 32221) },
+                    { CDF3(11359, 24878, 29896) },
+                    { CDF3( 6949, 17767, 24893) },
+                    { CDF3( 4287, 11796, 18071) },
+                    { CDF3(27880, 32521, 32705) },
+                    { CDF3(19038, 31004, 32414) },
+                    { CDF3(12564, 26345, 30768) },
+                    { CDF3( 8269, 19947, 26779) },
+                    { CDF3( 5674, 14657, 21674) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(25742, 32319, 32671) },
+                    { CDF3(19557, 31164, 32454) },
+                    { CDF3(13381, 26381, 30755) },
+                    { CDF3(10101, 21466, 26722) },
+                    { CDF3( 9209, 19650, 26825) },
+                    { CDF3(27107, 31917, 32432) },
+                    { CDF3(18056, 28893, 31203) },
+                    { CDF3(10200, 21434, 26764) },
+                    { CDF3( 4660, 12913, 19502) },
+                    { CDF3( 2368,  6930, 12504) },
+                    { CDF3(26960, 32158, 32613) },
+                    { CDF3(18628, 30005, 32031) },
+                    { CDF3(10233, 22442, 28232) },
+                    { CDF3( 5471, 14630, 21516) },
+                    { CDF3( 3235, 10767, 17109) },
+                    { CDF3(27696, 32440, 32692) },
+                    { CDF3(20032, 31167, 32438) },
+                    { CDF3( 8700, 21341, 28442) },
+                    { CDF3( 5662, 14831, 21795) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 9704, 17294, 21132) },
+                    { CDF3(26762, 32278, 32633) },
+                    { CDF3(18382, 29620, 31819) },
+                    { CDF3(10891, 23475, 28723) },
+                    { CDF3( 6358, 16583, 23309) },
+                    { CDF3( 3248,  9118, 14141) },
+                    { CDF3(27204, 32573, 32699) },
+                    { CDF3(19818, 30824, 32329) },
+                    { CDF3(11772, 25120, 30041) },
+                    { CDF3( 6995, 18033, 25039) },
+                    { CDF3( 3752, 10442, 16098) },
+                    { CDF3(27222, 32256, 32559) },
+                    { CDF3(15356, 28399, 31475) },
+                    { CDF3( 8821, 20635, 27057) },
+                    { CDF3( 5511, 14404, 21239) },
+                    { CDF3( 2935,  8222, 13051) },
+                    { CDF3(24875, 32120, 32529) },
+                    { CDF3(15233, 28265, 31445) },
+                    { CDF3( 8605, 20570, 26932) },
+                    { CDF3( 5431, 14413, 21196) },
+                    { CDF3( 2994,  8341, 13223) },
+                    { CDF3(28201, 32604, 32700) },
+                    { CDF3(21041, 31446, 32456) },
+                    { CDF3(13221, 26213, 30475) },
+                    { CDF3( 8255, 19385, 26037) },
+                    { CDF3( 4930, 12585, 18830) },
+                    { CDF3(28768, 32448, 32627) },
+                    { CDF3(19705, 30561, 32021) },
+                    { CDF3(11572, 23589, 28220) },
+                    { CDF3( 5532, 15034, 21446) },
+                    { CDF3( 2460,  7150, 11456) },
+                    { CDF3(29874, 32619, 32699) },
+                    { CDF3(21621, 31071, 32201) },
+                    { CDF3(12511, 24747, 28992) },
+                    { CDF3( 6281, 16395, 22748) },
+                    { CDF3( 3246,  9278, 14497) },
+                    { CDF3(29715, 32625, 32712) },
+                    { CDF3(20958, 31011, 32283) },
+                    { CDF3(11233, 23671, 28806) },
+                    { CDF3( 6012, 16128, 22868) },
+                    { CDF3( 3427,  9851, 15414) },
+                }, {
+                    { CDF3(11016, 22111, 26794) },
+                    { CDF3(25946, 32357, 32677) },
+                    { CDF3(17890, 30452, 32252) },
+                    { CDF3(11678, 25142, 29816) },
+                    { CDF3( 6720, 17534, 24584) },
+                    { CDF3( 4230, 11665, 17820) },
+                    { CDF3(28400, 32623, 32747) },
+                    { CDF3(21164, 31668, 32575) },
+                    { CDF3(13572, 27388, 31182) },
+                    { CDF3( 8234, 20750, 27358) },
+                    { CDF3( 5065, 14055, 20897) },
+                    { CDF3(28981, 32547, 32705) },
+                    { CDF3(18681, 30543, 32239) },
+                    { CDF3(10919, 24075, 29286) },
+                    { CDF3( 6431, 17199, 24077) },
+                    { CDF3( 3819, 10464, 16618) },
+                    { CDF3(26870, 32467, 32693) },
+                    { CDF3(19041, 30831, 32347) },
+                    { CDF3(11794, 25211, 30016) },
+                    { CDF3( 6888, 18019, 24970) },
+                    { CDF3( 4370, 12363, 18992) },
+                    { CDF3(29578, 32670, 32744) },
+                    { CDF3(23159, 32007, 32613) },
+                    { CDF3(15315, 28669, 31676) },
+                    { CDF3( 9298, 22607, 28782) },
+                    { CDF3( 6144, 15913, 22968) },
+                    { CDF3(28110, 32499, 32669) },
+                    { CDF3(21574, 30937, 32015) },
+                    { CDF3(12759, 24818, 28727) },
+                    { CDF3( 6545, 16761, 23042) },
+                    { CDF3( 3649, 10597, 16833) },
+                    { CDF3(28163, 32552, 32728) },
+                    { CDF3(22101, 31469, 32464) },
+                    { CDF3(13160, 25472, 30143) },
+                    { CDF3( 7303, 18684, 25468) },
+                    { CDF3( 5241, 13975, 20955) },
+                    { CDF3(28400, 32631, 32744) },
+                    { CDF3(22104, 31793, 32603) },
+                    { CDF3(13557, 26571, 30846) },
+                    { CDF3( 7749, 19861, 26675) },
+                    { CDF3( 4873, 14030, 21234) },
+                },
+            }, {
+                {
+                    { CDF3( 9800, 17635, 21073) },
+                    { CDF3(26153, 31885, 32527) },
+                    { CDF3(15038, 27852, 31006) },
+                    { CDF3( 8718, 20564, 26486) },
+                    { CDF3( 5128, 14076, 20514) },
+                    { CDF3( 2636,  7566, 11925) },
+                    { CDF3(27551, 32504, 32701) },
+                    { CDF3(18310, 30054, 32100) },
+                    { CDF3(10211, 23420, 29082) },
+                    { CDF3( 6222, 16876, 23916) },
+                    { CDF3( 3462,  9954, 15498) },
+                    { CDF3(29991, 32633, 32721) },
+                    { CDF3(19883, 30751, 32201) },
+                    { CDF3(11141, 24184, 29285) },
+                    { CDF3( 6420, 16940, 23774) },
+                    { CDF3( 3392,  9753, 15118) },
+                    { CDF3(28465, 32616, 32712) },
+                    { CDF3(19850, 30702, 32244) },
+                    { CDF3(10983, 24024, 29223) },
+                    { CDF3( 6294, 16770, 23582) },
+                    { CDF3( 3244,  9283, 14509) },
+                    { CDF3(30023, 32717, 32748) },
+                    { CDF3(22940, 32032, 32626) },
+                    { CDF3(14282, 27928, 31473) },
+                    { CDF3( 8562, 21327, 27914) },
+                    { CDF3( 4846, 13393, 19919) },
+                    { CDF3(29981, 32590, 32695) },
+                    { CDF3(20465, 30963, 32166) },
+                    { CDF3(11479, 23579, 28195) },
+                    { CDF3( 5916, 15648, 22073) },
+                    { CDF3( 3031,  8605, 13398) },
+                    { CDF3(31146, 32691, 32739) },
+                    { CDF3(23106, 31724, 32444) },
+                    { CDF3(13783, 26738, 30439) },
+                    { CDF3( 7852, 19468, 25807) },
+                    { CDF3( 3860, 11124, 16853) },
+                    { CDF3(31014, 32724, 32748) },
+                    { CDF3(23629, 32109, 32628) },
+                    { CDF3(14747, 28115, 31403) },
+                    { CDF3( 8545, 21242, 27478) },
+                    { CDF3( 4574, 12781, 19067) },
+                }, {
+                    { CDF3( 9185, 19694, 24688) },
+                    { CDF3(26081, 31985, 32621) },
+                    { CDF3(16015, 29000, 31787) },
+                    { CDF3(10542, 23690, 29206) },
+                    { CDF3( 6732, 17945, 24677) },
+                    { CDF3( 3916, 11039, 16722) },
+                    { CDF3(28224, 32566, 32744) },
+                    { CDF3(19100, 31138, 32485) },
+                    { CDF3(12528, 26620, 30879) },
+                    { CDF3( 7741, 20277, 26885) },
+                    { CDF3( 4566, 12845, 18990) },
+                    { CDF3(29933, 32593, 32718) },
+                    { CDF3(17670, 30333, 32155) },
+                    { CDF3(10385, 23600, 28909) },
+                    { CDF3( 6243, 16236, 22407) },
+                    { CDF3( 3976, 10389, 16017) },
+                    { CDF3(28377, 32561, 32738) },
+                    { CDF3(19366, 31175, 32482) },
+                    { CDF3(13327, 27175, 31094) },
+                    { CDF3( 8258, 20769, 27143) },
+                    { CDF3( 4703, 13198, 19527) },
+                    { CDF3(31086, 32706, 32748) },
+                    { CDF3(22853, 31902, 32583) },
+                    { CDF3(14759, 28186, 31419) },
+                    { CDF3( 9284, 22382, 28348) },
+                    { CDF3( 5585, 15192, 21868) },
+                    { CDF3(28291, 32652, 32746) },
+                    { CDF3(19849, 32107, 32571) },
+                    { CDF3(14834, 26818, 29214) },
+                    { CDF3(10306, 22594, 28672) },
+                    { CDF3( 6615, 17384, 23384) },
+                    { CDF3(28947, 32604, 32745) },
+                    { CDF3(25625, 32289, 32646) },
+                    { CDF3(18758, 28672, 31403) },
+                    { CDF3(10017, 23430, 28523) },
+                    { CDF3( 6862, 15269, 22131) },
+                    { CDF3(23933, 32509, 32739) },
+                    { CDF3(19927, 31495, 32631) },
+                    { CDF3(11903, 26023, 30621) },
+                    { CDF3( 7026, 20094, 27252) },
+                    { CDF3( 5998, 18106, 24437) },
+                },
+            }, {
+                {
+                    { CDF3( 4456, 11274, 15533) },
+                    { CDF3(21219, 29079, 31616) },
+                    { CDF3(11173, 23774, 28567) },
+                    { CDF3( 7282, 18293, 24263) },
+                    { CDF3( 4890, 13286, 19115) },
+                    { CDF3( 1890,  5508,  8659) },
+                    { CDF3(26651, 32136, 32647) },
+                    { CDF3(14630, 28254, 31455) },
+                    { CDF3( 8716, 21287, 27395) },
+                    { CDF3( 5615, 15331, 22008) },
+                    { CDF3( 2675,  7700, 12150) },
+                    { CDF3(29954, 32526, 32690) },
+                    { CDF3(16126, 28982, 31633) },
+                    { CDF3( 9030, 21361, 27352) },
+                    { CDF3( 5411, 14793, 21271) },
+                    { CDF3( 2943,  8422, 13163) },
+                    { CDF3(29539, 32601, 32730) },
+                    { CDF3(18125, 30385, 32201) },
+                    { CDF3(10422, 24090, 29468) },
+                    { CDF3( 6468, 17487, 24438) },
+                    { CDF3( 2970,  8653, 13531) },
+                    { CDF3(30912, 32715, 32748) },
+                    { CDF3(20666, 31373, 32497) },
+                    { CDF3(12509, 26640, 30917) },
+                    { CDF3( 8058, 20629, 27290) },
+                    { CDF3( 4231, 12006, 18052) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3(10202, 20633, 25484) },
+                    { CDF3(27336, 31445, 32352) },
+                    { CDF3(12420, 24384, 28552) },
+                    { CDF3( 7648, 18115, 23856) },
+                    { CDF3( 5662, 14341, 19902) },
+                    { CDF3( 3611, 10328, 15390) },
+                    { CDF3(30945, 32616, 32736) },
+                    { CDF3(18682, 30505, 32253) },
+                    { CDF3(11513, 25336, 30203) },
+                    { CDF3( 7449, 19452, 26148) },
+                    { CDF3( 4482, 13051, 18886) },
+                    { CDF3(32022, 32690, 32747) },
+                    { CDF3(18578, 30501, 32146) },
+                    { CDF3(11249, 23368, 28631) },
+                    { CDF3( 5645, 16958, 22158) },
+                    { CDF3( 5009, 11444, 16637) },
+                    { CDF3(31357, 32710, 32748) },
+                    { CDF3(21552, 31494, 32504) },
+                    { CDF3(13891, 27677, 31340) },
+                    { CDF3( 9051, 22098, 28172) },
+                    { CDF3( 5190, 13377, 19486) },
+                    { CDF3(32364, 32740, 32748) },
+                    { CDF3(24839, 31907, 32551) },
+                    { CDF3(17160, 28779, 31696) },
+                    { CDF3(12452, 24137, 29602) },
+                    { CDF3( 6165, 15389, 22477) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 2575,  7281, 11077) },
+                    { CDF3(14002, 20866, 25402) },
+                    { CDF3( 6343, 15056, 19658) },
+                    { CDF3( 4474, 11858, 17041) },
+                    { CDF3( 2865,  8299, 12534) },
+                    { CDF3( 1344,  3949,  6391) },
+                    { CDF3(24720, 31239, 32459) },
+                    { CDF3(12585, 25356, 29968) },
+                    { CDF3( 7181, 18246, 24444) },
+                    { CDF3( 5025, 13667, 19885) },
+                    { CDF3( 2521,  7304, 11605) },
+                    { CDF3(29908, 32252, 32584) },
+                    { CDF3(17421, 29156, 31575) },
+                    { CDF3( 9889, 22188, 27782) },
+                    { CDF3( 5878, 15647, 22123) },
+                    { CDF3( 2814,  8665, 13323) },
+                    { CDF3(30183, 32568, 32713) },
+                    { CDF3(18528, 30195, 32049) },
+                    { CDF3(10982, 24606, 29657) },
+                    { CDF3( 6957, 18165, 25231) },
+                    { CDF3( 3508, 10118, 15468) },
+                    { CDF3(31761, 32736, 32748) },
+                    { CDF3(21041, 31328, 32546) },
+                    { CDF3(12568, 26732, 31166) },
+                    { CDF3( 8052, 20720, 27733) },
+                    { CDF3( 4336, 12192, 18396) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            },
+        }, .dc_sign = {
+            { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+            { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+        }, .br_tok = {
+            {
+                {
+                    { CDF3(16138, 22223, 25509) },
+                    { CDF3(15347, 22430, 26332) },
+                    { CDF3( 9614, 16736, 21332) },
+                    { CDF3( 6600, 12275, 16907) },
+                    { CDF3( 4811,  9424, 13547) },
+                    { CDF3( 3748,  7809, 11420) },
+                    { CDF3( 2254,  4587,  6890) },
+                    { CDF3(15196, 20284, 23177) },
+                    { CDF3(18317, 25469, 28451) },
+                    { CDF3(13918, 21651, 25842) },
+                    { CDF3(10052, 17150, 21995) },
+                    { CDF3( 7499, 13630, 18587) },
+                    { CDF3( 6158, 11417, 16003) },
+                    { CDF3( 4014,  7785, 11252) },
+                    { CDF3(15048, 21067, 24384) },
+                    { CDF3(18202, 25346, 28553) },
+                    { CDF3(14302, 22019, 26356) },
+                    { CDF3(10839, 18139, 23166) },
+                    { CDF3( 8715, 15744, 20806) },
+                    { CDF3( 7536, 13576, 18544) },
+                    { CDF3( 5413, 10335, 14498) },
+                }, {
+                    { CDF3(17394, 24501, 27895) },
+                    { CDF3(15889, 23420, 27185) },
+                    { CDF3(11561, 19133, 23870) },
+                    { CDF3( 8285, 14812, 19844) },
+                    { CDF3( 6496, 12043, 16550) },
+                    { CDF3( 4771,  9574, 13677) },
+                    { CDF3( 3603,  6830, 10144) },
+                    { CDF3(21656, 27704, 30200) },
+                    { CDF3(21324, 27915, 30511) },
+                    { CDF3(17327, 25336, 28997) },
+                    { CDF3(13417, 21381, 26033) },
+                    { CDF3(10132, 17425, 22338) },
+                    { CDF3( 8580, 15016, 19633) },
+                    { CDF3( 5694, 11477, 16411) },
+                    { CDF3(24116, 29780, 31450) },
+                    { CDF3(23853, 29695, 31591) },
+                    { CDF3(20085, 27614, 30428) },
+                    { CDF3(15326, 24335, 28575) },
+                    { CDF3(11814, 19472, 24810) },
+                    { CDF3(10221, 18611, 24767) },
+                    { CDF3( 7689, 14558, 20321) },
+                },
+            }, {
+                {
+                    { CDF3(16214, 22380, 25770) },
+                    { CDF3(14213, 21304, 25295) },
+                    { CDF3( 9213, 15823, 20455) },
+                    { CDF3( 6395, 11758, 16139) },
+                    { CDF3( 4779,  9187, 13066) },
+                    { CDF3( 3821,  7501, 10953) },
+                    { CDF3( 2293,  4567,  6795) },
+                    { CDF3(15859, 21283, 23820) },
+                    { CDF3(18404, 25602, 28726) },
+                    { CDF3(14325, 21980, 26206) },
+                    { CDF3(10669, 17937, 22720) },
+                    { CDF3( 8297, 14642, 19447) },
+                    { CDF3( 6746, 12389, 16893) },
+                    { CDF3( 4324,  8251, 11770) },
+                    { CDF3(16532, 21631, 24475) },
+                    { CDF3(20667, 27150, 29668) },
+                    { CDF3(16728, 24510, 28175) },
+                    { CDF3(12861, 20645, 25332) },
+                    { CDF3(10076, 17361, 22417) },
+                    { CDF3( 8395, 14940, 19963) },
+                    { CDF3( 5731, 10683, 14912) },
+                }, {
+                    { CDF3(14433, 21155, 24938) },
+                    { CDF3(14658, 21716, 25545) },
+                    { CDF3( 9923, 16824, 21557) },
+                    { CDF3( 6982, 13052, 17721) },
+                    { CDF3( 5419, 10503, 15050) },
+                    { CDF3( 4852,  9162, 13014) },
+                    { CDF3( 3271,  6395,  9630) },
+                    { CDF3(22210, 27833, 30109) },
+                    { CDF3(20750, 27368, 29821) },
+                    { CDF3(16894, 24828, 28573) },
+                    { CDF3(13247, 21276, 25757) },
+                    { CDF3(10038, 17265, 22563) },
+                    { CDF3( 8587, 14947, 20327) },
+                    { CDF3( 5645, 11371, 15252) },
+                    { CDF3(22027, 27526, 29714) },
+                    { CDF3(23098, 29146, 31221) },
+                    { CDF3(19886, 27341, 30272) },
+                    { CDF3(15609, 23747, 28046) },
+                    { CDF3(11993, 20065, 24939) },
+                    { CDF3( 9637, 18267, 23671) },
+                    { CDF3( 7625, 13801, 19144) },
+                },
+            }, {
+                {
+                    { CDF3(14438, 20798, 24089) },
+                    { CDF3(12621, 19203, 23097) },
+                    { CDF3( 8177, 14125, 18402) },
+                    { CDF3( 5674, 10501, 14456) },
+                    { CDF3( 4236,  8239, 11733) },
+                    { CDF3( 3447,  6750,  9806) },
+                    { CDF3( 1986,  3950,  5864) },
+                    { CDF3(16208, 22099, 24930) },
+                    { CDF3(16537, 24025, 27585) },
+                    { CDF3(12780, 20381, 24867) },
+                    { CDF3( 9767, 16612, 21416) },
+                    { CDF3( 7686, 13738, 18398) },
+                    { CDF3( 6333, 11614, 15964) },
+                    { CDF3( 3941,  7571, 10836) },
+                    { CDF3(22819, 27422, 29202) },
+                    { CDF3(22224, 28514, 30721) },
+                    { CDF3(17660, 25433, 28913) },
+                    { CDF3(13574, 21482, 26002) },
+                    { CDF3(10629, 17977, 22938) },
+                    { CDF3( 8612, 15298, 20265) },
+                    { CDF3( 5607, 10491, 14596) },
+                }, {
+                    { CDF3(13569, 19800, 23206) },
+                    { CDF3(13128, 19924, 23869) },
+                    { CDF3( 8329, 14841, 19403) },
+                    { CDF3( 6130, 10976, 15057) },
+                    { CDF3( 4682,  8839, 12518) },
+                    { CDF3( 3656,  7409, 10588) },
+                    { CDF3( 2577,  5099,  7412) },
+                    { CDF3(22427, 28684, 30585) },
+                    { CDF3(20913, 27750, 30139) },
+                    { CDF3(15840, 24109, 27834) },
+                    { CDF3(12308, 20029, 24569) },
+                    { CDF3(10216, 16785, 21458) },
+                    { CDF3( 8309, 14203, 19113) },
+                    { CDF3( 6043, 11168, 15307) },
+                    { CDF3(23166, 28901, 30998) },
+                    { CDF3(21899, 28405, 30751) },
+                    { CDF3(18413, 26091, 29443) },
+                    { CDF3(15233, 23114, 27352) },
+                    { CDF3(12683, 20472, 25288) },
+                    { CDF3(10702, 18259, 23409) },
+                    { CDF3( 8125, 14464, 19226) },
+                },
+            }, {
+                {
+                    { CDF3( 9040, 14786, 18360) },
+                    { CDF3( 9979, 15718, 19415) },
+                    { CDF3( 7913, 13918, 18311) },
+                    { CDF3( 5859, 10889, 15184) },
+                    { CDF3( 4593,  8677, 12510) },
+                    { CDF3( 3820,  7396, 10791) },
+                    { CDF3( 1730,  3471,  5192) },
+                    { CDF3(11803, 18365, 22709) },
+                    { CDF3(11419, 18058, 22225) },
+                    { CDF3( 9418, 15774, 20243) },
+                    { CDF3( 7539, 13325, 17657) },
+                    { CDF3( 6233, 11317, 15384) },
+                    { CDF3( 5137,  9656, 13545) },
+                    { CDF3( 2977,  5774,  8349) },
+                    { CDF3(21207, 27246, 29640) },
+                    { CDF3(19547, 26578, 29497) },
+                    { CDF3(16169, 23871, 27690) },
+                    { CDF3(12820, 20458, 25018) },
+                    { CDF3(10224, 17332, 22214) },
+                    { CDF3( 8526, 15048, 19884) },
+                    { CDF3( 5037,  9410, 13118) },
+                }, {
+                    { CDF3(12339, 17329, 20140) },
+                    { CDF3(13505, 19895, 23225) },
+                    { CDF3( 9847, 16944, 21564) },
+                    { CDF3( 7280, 13256, 18348) },
+                    { CDF3( 4712, 10009, 14454) },
+                    { CDF3( 4361,  7914, 12477) },
+                    { CDF3( 2870,  5628,  7995) },
+                    { CDF3(20061, 25504, 28526) },
+                    { CDF3(15235, 22878, 26145) },
+                    { CDF3(12985, 19958, 24155) },
+                    { CDF3( 9782, 16641, 21403) },
+                    { CDF3( 9456, 16360, 20760) },
+                    { CDF3( 6855, 12940, 18557) },
+                    { CDF3( 5661, 10564, 15002) },
+                    { CDF3(25656, 30602, 31894) },
+                    { CDF3(22570, 29107, 31092) },
+                    { CDF3(18917, 26423, 29541) },
+                    { CDF3(15940, 23649, 27754) },
+                    { CDF3(12803, 20581, 25219) },
+                    { CDF3(11082, 18695, 23376) },
+                    { CDF3( 7939, 14373, 19005) },
+                },
+            },
+        },
+    }, [3] = {
+        .skip = {
+            {
+                { CDF1(26887) }, { CDF1( 6729) }, { CDF1(10361) },
+                { CDF1(17442) }, { CDF1(15045) }, { CDF1(22478) },
+                { CDF1(29072) }, { CDF1( 2713) }, { CDF1(11861) },
+                { CDF1(20773) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(31903) }, { CDF1( 2044) }, { CDF1( 7528) },
+                { CDF1(14618) }, { CDF1(16182) }, { CDF1(24168) },
+                { CDF1(31037) }, { CDF1( 2786) }, { CDF1(11194) },
+                { CDF1(20155) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(32510) }, { CDF1( 8430) }, { CDF1(17318) },
+                { CDF1(24154) }, { CDF1(23674) }, { CDF1(28789) },
+                { CDF1(32139) }, { CDF1( 3440) }, { CDF1(13117) },
+                { CDF1(22702) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            }, {
+                { CDF1(31671) }, { CDF1( 2056) }, { CDF1(11746) },
+                { CDF1(16852) }, { CDF1(18635) }, { CDF1(24715) },
+                { CDF1(31484) }, { CDF1( 4656) }, { CDF1(16074) },
+                { CDF1(24704) }, { CDF1( 1806) }, { CDF1(14645) },
+                { CDF1(25336) },
+            }, {
+                { CDF1(31539) }, { CDF1( 8433) }, { CDF1(20576) },
+                { CDF1(27904) }, { CDF1(27852) }, { CDF1(30026) },
+                { CDF1(32441) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                { CDF1(16384) },
+            },
+        }, .eob_bin_16 = {
+            {
+                { CDF4( 6708,  8958, 14746, 22133) },
+                { CDF4( 1222,  2074,  4783, 15410) },
+            }, {
+                { CDF4(19575, 21766, 26044, 29709) },
+                { CDF4( 7297, 10767, 19273, 28194) },
+            },
+        }, .eob_bin_32 = {
+            {
+                { CDF5( 4617,  5709,  8446, 13584, 23135) },
+                { CDF5( 1156,  1702,  3675,  9274, 20539) },
+            }, {
+                { CDF5(22086, 24282, 27010, 29770, 31743) },
+                { CDF5( 7699, 10897, 20891, 26926, 31628) },
+            },
+        }, .eob_bin_64 = {
+            {
+                { CDF6( 6307,  7541, 12060, 16358, 22553, 27865) },
+                { CDF6( 1289,  2320,  3971,  7926, 14153, 24291) },
+            }, {
+                { CDF6(24212, 25708, 28268, 30035, 31307, 32049) },
+                { CDF6( 8726, 12378, 19409, 26450, 30038, 32462) },
+            },
+        }, .eob_bin_128 = {
+            {
+                { CDF7( 3472,  4885,  7489, 12481, 18517, 24536, 29635) },
+                { CDF7(  886,  1731,  3271,  8469, 15569, 22126, 28383) },
+            }, {
+                { CDF7(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+                { CDF7( 9165, 13282, 21150, 30286, 31894, 32571, 32712) },
+            },
+        }, .eob_bin_256 = {
+            {
+                { CDF8( 5348,  7113, 11820, 15924,
+                       22106, 26777, 30334, 31757) },
+                { CDF8( 2453,  4474,  6307,  8777,
+                       16474, 22975, 29000, 31547) },
+            }, {
+                { CDF8(23110, 24597, 27140, 28894,
+                       30167, 30927, 31392, 32094) },
+                { CDF8( 9998, 17661, 25178, 28097,
+                       31308, 32038, 32403, 32695) },
+            },
+        }, .eob_bin_512 = {
+            { CDF9( 5927,  7809, 10923, 14597, 19439,
+                   24135, 28456, 31142, 32060) },
+            { CDF9(21093, 23043, 25742, 27658, 29097,
+                   29716, 30073, 30820, 31956) },
+        }, .eob_bin_1024 = {
+            { CDF10( 6698,  8334, 11961, 15762, 20186,
+                    23862, 27434, 29326, 31082, 32050) },
+            { CDF10(20569, 22426, 25569, 26859, 28053,
+                    28913, 29486, 29724, 29807, 32570) },
+        }, .eob_hi_bit = {
+            {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20177) },
+                    { CDF1(20789) }, { CDF1(20262) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(21416) },
+                    { CDF1(20855) }, { CDF1(23410) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20238) },
+                    { CDF1(21057) }, { CDF1(19159) }, { CDF1(22337) },
+                    { CDF1(20159) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(20125) },
+                    { CDF1(20559) }, { CDF1(21707) }, { CDF1(22296) },
+                    { CDF1(17333) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(19941) },
+                    { CDF1(20527) }, { CDF1(21470) }, { CDF1(22487) },
+                    { CDF1(19558) }, { CDF1(22354) }, { CDF1(20331) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(22752) },
+                    { CDF1(25006) }, { CDF1(22075) }, { CDF1(21576) },
+                    { CDF1(17740) }, { CDF1(21690) }, { CDF1(19211) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(21442) },
+                    { CDF1(22358) }, { CDF1(18503) }, { CDF1(20291) },
+                    { CDF1(19945) }, { CDF1(21294) }, { CDF1(21178) },
+                    { CDF1(19400) }, { CDF1(10556) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(24648) },
+                    { CDF1(24949) }, { CDF1(20708) }, { CDF1(23905) },
+                    { CDF1(20501) }, { CDF1( 9558) }, { CDF1( 9423) },
+                    { CDF1(30365) }, { CDF1(19253) },
+                },
+            }, {
+                {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(26064) },
+                    { CDF1(22098) }, { CDF1(19613) }, { CDF1(20525) },
+                    { CDF1(17595) }, { CDF1(16618) }, { CDF1(20497) },
+                    { CDF1(18989) }, { CDF1(15513) },
+                }, {
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+                    { CDF1(16384) }, { CDF1(16384) },
+                },
+            },
+        }, .eob_base_tok = {
+            {
+                {
+                    { CDF2(22497, 31198) }, { CDF2(31715, 32495) },
+                    { CDF2(31606, 32337) }, { CDF2(30388, 31990) },
+                }, {
+                    { CDF2(27877, 31584) }, { CDF2(32170, 32728) },
+                    { CDF2(32155, 32688) }, { CDF2(32219, 32702) },
+                },
+            }, {
+                {
+                    { CDF2(21457, 31043) }, { CDF2(31951, 32483) },
+                    { CDF2(32153, 32562) }, { CDF2(31473, 32215) },
+                }, {
+                    { CDF2(27558, 31151) }, { CDF2(32020, 32640) },
+                    { CDF2(32097, 32575) }, { CDF2(32242, 32719) },
+                },
+            }, {
+                {
+                    { CDF2(19980, 30591) }, { CDF2(32219, 32597) },
+                    { CDF2(32581, 32706) }, { CDF2(31803, 32287) },
+                }, {
+                    { CDF2(26473, 30507) }, { CDF2(32431, 32723) },
+                    { CDF2(32196, 32611) }, { CDF2(31588, 32528) },
+                },
+            }, {
+                {
+                    { CDF2(24647, 30463) }, { CDF2(32412, 32695) },
+                    { CDF2(32468, 32720) }, { CDF2(31269, 32523) },
+                }, {
+                    { CDF2(28482, 31505) }, { CDF2(32152, 32701) },
+                    { CDF2(31732, 32598) }, { CDF2(31767, 32712) },
+                },
+            }, {
+                {
+                    { CDF2(12358, 24977) }, { CDF2(31331, 32385) },
+                    { CDF2(32634, 32756) }, { CDF2(30411, 32548) },
+                }, {
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                    { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+                },
+            },
+        }, .base_tok = {
+            {
+                {
+                    { CDF3( 7062, 16472, 22319) },
+                    { CDF3(24538, 32261, 32674) },
+                    { CDF3(13675, 28041, 31779) },
+                    { CDF3( 8590, 20674, 27631) },
+                    { CDF3( 5685, 14675, 22013) },
+                    { CDF3( 3655,  9898, 15731) },
+                    { CDF3(26493, 32418, 32658) },
+                    { CDF3(16376, 29342, 32090) },
+                    { CDF3(10594, 22649, 28970) },
+                    { CDF3( 8176, 17170, 24303) },
+                    { CDF3( 5605, 12694, 19139) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(23888, 31902, 32542) },
+                    { CDF3(18612, 29687, 31987) },
+                    { CDF3(16245, 24852, 29249) },
+                    { CDF3(15765, 22608, 27559) },
+                    { CDF3(19895, 24699, 27510) },
+                    { CDF3(28401, 32212, 32457) },
+                    { CDF3(15274, 27825, 30980) },
+                    { CDF3( 9364, 18128, 24332) },
+                    { CDF3( 2283,  8193, 15082) },
+                    { CDF3( 1228,  3972,  7881) },
+                    { CDF3(29455, 32469, 32620) },
+                    { CDF3(17981, 28245, 31388) },
+                    { CDF3(10921, 20098, 26240) },
+                    { CDF3( 3743, 11829, 18657) },
+                    { CDF3( 2374,  9593, 15715) },
+                    { CDF3(31068, 32466, 32635) },
+                    { CDF3(20321, 29572, 31971) },
+                    { CDF3(10771, 20255, 27119) },
+                    { CDF3( 2795, 10410, 17361) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 9320, 22102, 27840) },
+                    { CDF3(27057, 32464, 32724) },
+                    { CDF3(16331, 30268, 32309) },
+                    { CDF3(10319, 23935, 29720) },
+                    { CDF3( 6189, 16448, 24106) },
+                    { CDF3( 3589, 10884, 18808) },
+                    { CDF3(29026, 32624, 32748) },
+                    { CDF3(19226, 31507, 32587) },
+                    { CDF3(12692, 26921, 31203) },
+                    { CDF3( 7049, 19532, 27635) },
+                    { CDF3( 7727, 15669, 23252) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3(28056, 32625, 32748) },
+                    { CDF3(22383, 32075, 32669) },
+                    { CDF3(15417, 27098, 31749) },
+                    { CDF3(18127, 26493, 27190) },
+                    { CDF3( 5461, 16384, 21845) },
+                    { CDF3(27982, 32091, 32584) },
+                    { CDF3(19045, 29868, 31972) },
+                    { CDF3(10397, 22266, 27932) },
+                    { CDF3( 5990, 13697, 21500) },
+                    { CDF3( 1792,  6912, 15104) },
+                    { CDF3(28198, 32501, 32718) },
+                    { CDF3(21534, 31521, 32569) },
+                    { CDF3(11109, 25217, 30017) },
+                    { CDF3( 5671, 15124, 26151) },
+                    { CDF3( 4681, 14043, 18725) },
+                    { CDF3(28688, 32580, 32741) },
+                    { CDF3(22576, 32079, 32661) },
+                    { CDF3(10627, 22141, 28340) },
+                    { CDF3( 9362, 14043, 28087) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 7754, 16948, 22142) },
+                    { CDF3(25670, 32330, 32691) },
+                    { CDF3(15663, 29225, 31994) },
+                    { CDF3( 9878, 23288, 29158) },
+                    { CDF3( 6419, 17088, 24336) },
+                    { CDF3( 3859, 11003, 17039) },
+                    { CDF3(27562, 32595, 32725) },
+                    { CDF3(17575, 30588, 32399) },
+                    { CDF3(10819, 24838, 30309) },
+                    { CDF3( 7124, 18686, 25916) },
+                    { CDF3( 4479, 12688, 19340) },
+                    { CDF3(28385, 32476, 32673) },
+                    { CDF3(15306, 29005, 31938) },
+                    { CDF3( 8937, 21615, 28322) },
+                    { CDF3( 5982, 15603, 22786) },
+                    { CDF3( 3620, 10267, 16136) },
+                    { CDF3(27280, 32464, 32667) },
+                    { CDF3(15607, 29160, 32004) },
+                    { CDF3( 9091, 22135, 28740) },
+                    { CDF3( 6232, 16632, 24020) },
+                    { CDF3( 4047, 11377, 17672) },
+                    { CDF3(29220, 32630, 32718) },
+                    { CDF3(19650, 31220, 32462) },
+                    { CDF3(13050, 26312, 30827) },
+                    { CDF3( 9228, 20870, 27468) },
+                    { CDF3( 6146, 15149, 21971) },
+                    { CDF3(30169, 32481, 32623) },
+                    { CDF3(17212, 29311, 31554) },
+                    { CDF3( 9911, 21311, 26882) },
+                    { CDF3( 4487, 13314, 20372) },
+                    { CDF3( 2570,  7772, 12889) },
+                    { CDF3(30924, 32613, 32708) },
+                    { CDF3(19490, 30206, 32107) },
+                    { CDF3(11232, 23998, 29276) },
+                    { CDF3( 6769, 17955, 25035) },
+                    { CDF3( 4398, 12623, 19214) },
+                    { CDF3(30609, 32627, 32722) },
+                    { CDF3(19370, 30582, 32287) },
+                    { CDF3(10457, 23619, 29409) },
+                    { CDF3( 6443, 17637, 24834) },
+                    { CDF3( 4645, 13236, 20106) },
+                }, {
+                    { CDF3( 8626, 20271, 26216) },
+                    { CDF3(26707, 32406, 32711) },
+                    { CDF3(16999, 30329, 32286) },
+                    { CDF3(11445, 25123, 30286) },
+                    { CDF3( 6411, 18828, 25601) },
+                    { CDF3( 6801, 12458, 20248) },
+                    { CDF3(29918, 32682, 32748) },
+                    { CDF3(20649, 31739, 32618) },
+                    { CDF3(12879, 27773, 31581) },
+                    { CDF3( 7896, 21751, 28244) },
+                    { CDF3( 5260, 14870, 23698) },
+                    { CDF3(29252, 32593, 32731) },
+                    { CDF3(17072, 30460, 32294) },
+                    { CDF3(10653, 24143, 29365) },
+                    { CDF3( 6536, 17490, 23983) },
+                    { CDF3( 4929, 13170, 20085) },
+                    { CDF3(28137, 32518, 32715) },
+                    { CDF3(18171, 30784, 32407) },
+                    { CDF3(11437, 25436, 30459) },
+                    { CDF3( 7252, 18534, 26176) },
+                    { CDF3( 4126, 13353, 20978) },
+                    { CDF3(31162, 32726, 32748) },
+                    { CDF3(23017, 32222, 32701) },
+                    { CDF3(15629, 29233, 32046) },
+                    { CDF3( 9387, 22621, 29480) },
+                    { CDF3( 6922, 17616, 25010) },
+                    { CDF3(28838, 32265, 32614) },
+                    { CDF3(19701, 30206, 31920) },
+                    { CDF3(11214, 22410, 27933) },
+                    { CDF3( 5320, 14177, 23034) },
+                    { CDF3( 5049, 12881, 17827) },
+                    { CDF3(27484, 32471, 32734) },
+                    { CDF3(21076, 31526, 32561) },
+                    { CDF3(12707, 26303, 31211) },
+                    { CDF3( 8169, 21722, 28219) },
+                    { CDF3( 6045, 19406, 27042) },
+                    { CDF3(27753, 32572, 32745) },
+                    { CDF3(20832, 31878, 32653) },
+                    { CDF3(13250, 27356, 31674) },
+                    { CDF3( 7718, 21508, 29858) },
+                    { CDF3( 7209, 18350, 25559) },
+                },
+            }, {
+                {
+                    { CDF3( 7876, 16901, 21741) },
+                    { CDF3(24001, 31898, 32625) },
+                    { CDF3(14529, 27959, 31451) },
+                    { CDF3( 8273, 20818, 27258) },
+                    { CDF3( 5278, 14673, 21510) },
+                    { CDF3( 2983,  8843, 14039) },
+                    { CDF3(28016, 32574, 32732) },
+                    { CDF3(17471, 30306, 32301) },
+                    { CDF3(10224, 24063, 29728) },
+                    { CDF3( 6602, 17954, 25052) },
+                    { CDF3( 4002, 11585, 17759) },
+                    { CDF3(30190, 32634, 32739) },
+                    { CDF3(17497, 30282, 32270) },
+                    { CDF3(10229, 23729, 29538) },
+                    { CDF3( 6344, 17211, 24440) },
+                    { CDF3( 3849, 11189, 17108) },
+                    { CDF3(28570, 32583, 32726) },
+                    { CDF3(17521, 30161, 32238) },
+                    { CDF3(10153, 23565, 29378) },
+                    { CDF3( 6455, 17341, 24443) },
+                    { CDF3( 3907, 11042, 17024) },
+                    { CDF3(30689, 32715, 32748) },
+                    { CDF3(21546, 31840, 32610) },
+                    { CDF3(13547, 27581, 31459) },
+                    { CDF3( 8912, 21757, 28309) },
+                    { CDF3( 5548, 15080, 22046) },
+                    { CDF3(30783, 32540, 32685) },
+                    { CDF3(17540, 29528, 31668) },
+                    { CDF3(10160, 21468, 26783) },
+                    { CDF3( 4724, 13393, 20054) },
+                    { CDF3( 2702,  8174, 13102) },
+                    { CDF3(31648, 32686, 32742) },
+                    { CDF3(20954, 31094, 32337) },
+                    { CDF3(12420, 25698, 30179) },
+                    { CDF3( 7304, 19320, 26248) },
+                    { CDF3( 4366, 12261, 18864) },
+                    { CDF3(31581, 32723, 32748) },
+                    { CDF3(21373, 31586, 32525) },
+                    { CDF3(12744, 26625, 30885) },
+                    { CDF3( 7431, 20322, 26950) },
+                    { CDF3( 4692, 13323, 20111) },
+                }, {
+                    { CDF3( 7833, 18369, 24095) },
+                    { CDF3(26650, 32273, 32702) },
+                    { CDF3(16371, 29961, 32191) },
+                    { CDF3(11055, 24082, 29629) },
+                    { CDF3( 6892, 18644, 25400) },
+                    { CDF3( 5006, 13057, 19240) },
+                    { CDF3(29834, 32666, 32748) },
+                    { CDF3(19577, 31335, 32570) },
+                    { CDF3(12253, 26509, 31122) },
+                    { CDF3( 7991, 20772, 27711) },
+                    { CDF3( 5677, 15910, 23059) },
+                    { CDF3(30109, 32532, 32720) },
+                    { CDF3(16747, 30166, 32252) },
+                    { CDF3(10134, 23542, 29184) },
+                    { CDF3( 5791, 16176, 23556) },
+                    { CDF3( 4362, 10414, 17284) },
+                    { CDF3(29492, 32626, 32748) },
+                    { CDF3(19894, 31402, 32525) },
+                    { CDF3(12942, 27071, 30869) },
+                    { CDF3( 8346, 21216, 27405) },
+                    { CDF3( 6572, 17087, 23859) },
+                    { CDF3(32035, 32735, 32748) },
+                    { CDF3(22957, 31838, 32618) },
+                    { CDF3(14724, 28572, 31772) },
+                    { CDF3(10364, 23999, 29553) },
+                    { CDF3( 7004, 18433, 25655) },
+                    { CDF3(27528, 32277, 32681) },
+                    { CDF3(16959, 31171, 32096) },
+                    { CDF3(10486, 23593, 27962) },
+                    { CDF3( 8192, 16384, 23211) },
+                    { CDF3( 8937, 17873, 20852) },
+                    { CDF3(27715, 32002, 32615) },
+                    { CDF3(15073, 29491, 31676) },
+                    { CDF3(11264, 24576, 28672) },
+                    { CDF3( 2341, 18725, 23406) },
+                    { CDF3( 7282, 18204, 25486) },
+                    { CDF3(28547, 32213, 32657) },
+                    { CDF3(20788, 29773, 32239) },
+                    { CDF3( 6780, 21469, 30508) },
+                    { CDF3( 5958, 14895, 23831) },
+                    { CDF3(16384, 21845, 27307) },
+                },
+            }, {
+                {
+                    { CDF3( 5992, 14304, 19765) },
+                    { CDF3(22612, 31238, 32456) },
+                    { CDF3(13456, 27162, 31087) },
+                    { CDF3( 8001, 20062, 26504) },
+                    { CDF3( 5168, 14105, 20764) },
+                    { CDF3( 2632,  7771, 12385) },
+                    { CDF3(27034, 32344, 32709) },
+                    { CDF3(15850, 29415, 31997) },
+                    { CDF3( 9494, 22776, 28841) },
+                    { CDF3( 6151, 16830, 23969) },
+                    { CDF3( 3461, 10039, 15722) },
+                    { CDF3(30134, 32569, 32731) },
+                    { CDF3(15638, 29422, 31945) },
+                    { CDF3( 9150, 21865, 28218) },
+                    { CDF3( 5647, 15719, 22676) },
+                    { CDF3( 3402,  9772, 15477) },
+                    { CDF3(28530, 32586, 32735) },
+                    { CDF3(17139, 30298, 32292) },
+                    { CDF3(10200, 24039, 29685) },
+                    { CDF3( 6419, 17674, 24786) },
+                    { CDF3( 3544, 10225, 15824) },
+                    { CDF3(31333, 32726, 32748) },
+                    { CDF3(20618, 31487, 32544) },
+                    { CDF3(12901, 27217, 31232) },
+                    { CDF3( 8624, 21734, 28171) },
+                    { CDF3( 5104, 14191, 20748) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3(11206, 21090, 26561) },
+                    { CDF3(28759, 32279, 32671) },
+                    { CDF3(14171, 27952, 31569) },
+                    { CDF3( 9743, 22907, 29141) },
+                    { CDF3( 6871, 17886, 24868) },
+                    { CDF3( 4960, 13152, 19315) },
+                    { CDF3(31077, 32661, 32748) },
+                    { CDF3(19400, 31195, 32515) },
+                    { CDF3(12752, 26858, 31040) },
+                    { CDF3( 8370, 22098, 28591) },
+                    { CDF3( 5457, 15373, 22298) },
+                    { CDF3(31697, 32706, 32748) },
+                    { CDF3(17860, 30657, 32333) },
+                    { CDF3(12510, 24812, 29261) },
+                    { CDF3( 6180, 19124, 24722) },
+                    { CDF3( 5041, 13548, 17959) },
+                    { CDF3(31552, 32716, 32748) },
+                    { CDF3(21908, 31769, 32623) },
+                    { CDF3(14470, 28201, 31565) },
+                    { CDF3( 9493, 22982, 28608) },
+                    { CDF3( 6858, 17240, 24137) },
+                    { CDF3(32543, 32752, 32756) },
+                    { CDF3(24286, 32097, 32666) },
+                    { CDF3(15958, 29217, 32024) },
+                    { CDF3(10207, 24234, 29958) },
+                    { CDF3( 6929, 18305, 25652) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            }, {
+                {
+                    { CDF3( 4137, 10847, 15682) },
+                    { CDF3(17824, 27001, 30058) },
+                    { CDF3(10204, 22796, 28291) },
+                    { CDF3( 6076, 15935, 22125) },
+                    { CDF3( 3852, 10937, 16816) },
+                    { CDF3( 2252,  6324, 10131) },
+                    { CDF3(25840, 32016, 32662) },
+                    { CDF3(15109, 28268, 31531) },
+                    { CDF3( 9385, 22231, 28340) },
+                    { CDF3( 6082, 16672, 23479) },
+                    { CDF3( 3318,  9427, 14681) },
+                    { CDF3(30594, 32574, 32718) },
+                    { CDF3(16836, 29552, 31859) },
+                    { CDF3( 9556, 22542, 28356) },
+                    { CDF3( 6305, 16725, 23540) },
+                    { CDF3( 3376,  9895, 15184) },
+                    { CDF3(29383, 32617, 32745) },
+                    { CDF3(18891, 30809, 32401) },
+                    { CDF3(11688, 25942, 30687) },
+                    { CDF3( 7468, 19469, 26651) },
+                    { CDF3( 3909, 11358, 17012) },
+                    { CDF3(31564, 32736, 32748) },
+                    { CDF3(20906, 31611, 32600) },
+                    { CDF3(13191, 27621, 31537) },
+                    { CDF3( 8768, 22029, 28676) },
+                    { CDF3( 5079, 14109, 20906) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                }, {
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                    { CDF3( 8192, 16384, 24576) },
+                },
+            },
+        }, .dc_sign = {
+            { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+            { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+        }, .br_tok = {
+            {
+                {
+                    { CDF3(18315, 24289, 27551) },
+                    { CDF3(16854, 24068, 27835) },
+                    { CDF3(10140, 17927, 23173) },
+                    { CDF3( 6722, 12982, 18267) },
+                    { CDF3( 4661,  9826, 14706) },
+                    { CDF3( 3832,  8165, 12294) },
+                    { CDF3( 2795,  6098,  9245) },
+                    { CDF3(17145, 23326, 26672) },
+                    { CDF3(20733, 27680, 30308) },
+                    { CDF3(16032, 24461, 28546) },
+                    { CDF3(11653, 20093, 25081) },
+                    { CDF3( 9290, 16429, 22086) },
+                    { CDF3( 7796, 14598, 19982) },
+                    { CDF3( 6502, 12378, 17441) },
+                    { CDF3(21681, 27732, 30320) },
+                    { CDF3(22389, 29044, 31261) },
+                    { CDF3(19027, 26731, 30087) },
+                    { CDF3(14739, 23755, 28624) },
+                    { CDF3(11358, 20778, 25511) },
+                    { CDF3(10995, 18073, 24190) },
+                    { CDF3( 9162, 14990, 20617) },
+                }, {
+                    { CDF3(21425, 27952, 30388) },
+                    { CDF3(18062, 25838, 29034) },
+                    { CDF3(11956, 19881, 24808) },
+                    { CDF3( 7718, 15000, 20980) },
+                    { CDF3( 5702, 11254, 16143) },
+                    { CDF3( 4898,  9088, 16864) },
+                    { CDF3( 3679,  6776, 11907) },
+                    { CDF3(23294, 30160, 31663) },
+                    { CDF3(24397, 29896, 31836) },
+                    { CDF3(19245, 27128, 30593) },
+                    { CDF3(13202, 19825, 26404) },
+                    { CDF3(11578, 19297, 23957) },
+                    { CDF3( 8073, 13297, 21370) },
+                    { CDF3( 5461, 10923, 19745) },
+                    { CDF3(27367, 30521, 31934) },
+                    { CDF3(24904, 30671, 31940) },
+                    { CDF3(23075, 28460, 31299) },
+                    { CDF3(14400, 23658, 30417) },
+                    { CDF3(13885, 23882, 28325) },
+                    { CDF3(14746, 22938, 27853) },
+                    { CDF3( 5461, 16384, 27307) },
+                },
+            }, {
+                {
+                    { CDF3(18274, 24813, 27890) },
+                    { CDF3(15537, 23149, 27003) },
+                    { CDF3( 9449, 16740, 21827) },
+                    { CDF3( 6700, 12498, 17261) },
+                    { CDF3( 4988,  9866, 14198) },
+                    { CDF3( 4236,  8147, 11902) },
+                    { CDF3( 2867,  5860,  8654) },
+                    { CDF3(17124, 23171, 26101) },
+                    { CDF3(20396, 27477, 30148) },
+                    { CDF3(16573, 24629, 28492) },
+                    { CDF3(12749, 20846, 25674) },
+                    { CDF3(10233, 17878, 22818) },
+                    { CDF3( 8525, 15332, 20363) },
+                    { CDF3( 6283, 11632, 16255) },
+                    { CDF3(20466, 26511, 29286) },
+                    { CDF3(23059, 29174, 31191) },
+                    { CDF3(19481, 27263, 30241) },
+                    { CDF3(15458, 23631, 28137) },
+                    { CDF3(12416, 20608, 25693) },
+                    { CDF3(10261, 18011, 23261) },
+                    { CDF3( 8016, 14655, 19666) },
+                }, {
+                    { CDF3(17616, 24586, 28112) },
+                    { CDF3(15809, 23299, 27155) },
+                    { CDF3(10767, 18890, 23793) },
+                    { CDF3( 7727, 14255, 18865) },
+                    { CDF3( 6129, 11926, 16882) },
+                    { CDF3( 4482,  9704, 14861) },
+                    { CDF3( 3277,  7452, 11522) },
+                    { CDF3(22956, 28551, 30730) },
+                    { CDF3(22724, 28937, 30961) },
+                    { CDF3(18467, 26324, 29580) },
+                    { CDF3(13234, 20713, 25649) },
+                    { CDF3(11181, 17592, 22481) },
+                    { CDF3( 8291, 18358, 24576) },
+                    { CDF3( 7568, 11881, 14984) },
+                    { CDF3(24948, 29001, 31147) },
+                    { CDF3(25674, 30619, 32151) },
+                    { CDF3(20841, 26793, 29603) },
+                    { CDF3(14669, 24356, 28666) },
+                    { CDF3(11334, 23593, 28219) },
+                    { CDF3( 8922, 14762, 22873) },
+                    { CDF3( 8301, 13544, 20535) },
+                },
+            }, {
+                {
+                    { CDF3(17113, 23733, 27081) },
+                    { CDF3(14139, 21406, 25452) },
+                    { CDF3( 8552, 15002, 19776) },
+                    { CDF3( 5871, 11120, 15378) },
+                    { CDF3( 4455,  8616, 12253) },
+                    { CDF3( 3469,  6910, 10386) },
+                    { CDF3( 2255,  4553,  6782) },
+                    { CDF3(18224, 24376, 27053) },
+                    { CDF3(19290, 26710, 29614) },
+                    { CDF3(14936, 22991, 27184) },
+                    { CDF3(11238, 18951, 23762) },
+                    { CDF3( 8786, 15617, 20588) },
+                    { CDF3( 7317, 13228, 18003) },
+                    { CDF3( 5101,  9512, 13493) },
+                    { CDF3(22639, 28222, 30210) },
+                    { CDF3(23216, 29331, 31307) },
+                    { CDF3(19075, 26762, 29895) },
+                    { CDF3(15014, 23113, 27457) },
+                    { CDF3(11938, 19857, 24752) },
+                    { CDF3( 9942, 17280, 22282) },
+                    { CDF3( 7167, 13144, 17752) },
+                }, {
+                    { CDF3(15820, 22738, 26488) },
+                    { CDF3(13530, 20885, 25216) },
+                    { CDF3( 8395, 15530, 20452) },
+                    { CDF3( 6574, 12321, 16380) },
+                    { CDF3( 5353, 10419, 14568) },
+                    { CDF3( 4613,  8446, 12381) },
+                    { CDF3( 3440,  7158,  9903) },
+                    { CDF3(24247, 29051, 31224) },
+                    { CDF3(22118, 28058, 30369) },
+                    { CDF3(16498, 24768, 28389) },
+                    { CDF3(12920, 21175, 26137) },
+                    { CDF3(10730, 18619, 25352) },
+                    { CDF3(10187, 16279, 22791) },
+                    { CDF3( 9310, 14631, 22127) },
+                    { CDF3(24970, 30558, 32057) },
+                    { CDF3(24801, 29942, 31698) },
+                    { CDF3(22432, 28453, 30855) },
+                    { CDF3(19054, 25680, 29580) },
+                    { CDF3(14392, 23036, 28109) },
+                    { CDF3(12495, 20947, 26650) },
+                    { CDF3(12442, 20326, 26214) },
+                },
+            }, {
+                {
+                    { CDF3(12162, 18785, 22648) },
+                    { CDF3(12749, 19697, 23806) },
+                    { CDF3( 8580, 15297, 20346) },
+                    { CDF3( 6169, 11749, 16543) },
+                    { CDF3( 4836,  9391, 13448) },
+                    { CDF3( 3821,  7711, 11613) },
+                    { CDF3( 2228,  4601,  7070) },
+                    { CDF3(16319, 24725, 28280) },
+                    { CDF3(15698, 23277, 27168) },
+                    { CDF3(12726, 20368, 25047) },
+                    { CDF3( 9912, 17015, 21976) },
+                    { CDF3( 7888, 14220, 19179) },
+                    { CDF3( 6777, 12284, 17018) },
+                    { CDF3( 4492,  8590, 12252) },
+                    { CDF3(23249, 28904, 30947) },
+                    { CDF3(21050, 27908, 30512) },
+                    { CDF3(17440, 25340, 28949) },
+                    { CDF3(14059, 22018, 26541) },
+                    { CDF3(11288, 18903, 23898) },
+                    { CDF3( 9411, 16342, 21428) },
+                    { CDF3( 6278, 11588, 15944) },
+                }, {
+                    { CDF3(13981, 20067, 23226) },
+                    { CDF3(16922, 23580, 26783) },
+                    { CDF3(11005, 19039, 24487) },
+                    { CDF3( 7389, 14218, 19798) },
+                    { CDF3( 5598, 11505, 17206) },
+                    { CDF3( 6090, 11213, 15659) },
+                    { CDF3( 3820,  7371, 10119) },
+                    { CDF3(21082, 26925, 29675) },
+                    { CDF3(21262, 28627, 31128) },
+                    { CDF3(18392, 26454, 30437) },
+                    { CDF3(14870, 22910, 27096) },
+                    { CDF3(12620, 19484, 24908) },
+                    { CDF3( 9290, 16553, 22802) },
+                    { CDF3( 6668, 14288, 20004) },
+                    { CDF3(27704, 31055, 31949) },
+                    { CDF3(24709, 29978, 31788) },
+                    { CDF3(21668, 29264, 31657) },
+                    { CDF3(18295, 26968, 30074) },
+                    { CDF3(16399, 24422, 29313) },
+                    { CDF3(14347, 23026, 28104) },
+                    { CDF3(12370, 19806, 24477) },
+                },
+            },
+        },
+    }
+};
+
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
+                             CdfContext *const dst,
+                             const CdfContext *const src)
+{
+#define update_cdf_1d(n1d, name) \
+    do { \
+        memcpy(dst->name, src->name, sizeof(dst->name)); \
+        dst->name[n1d] = 0; \
+    } while (0)
+
+#define update_cdf_2d(n1d, n2d, name) \
+    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
+#define update_cdf_3d(n1d, n2d, n3d, name) \
+    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
+#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
+    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
+
+#define update_bit_0d(name) \
+    do { \
+        dst->name[0] = src->name[0]; \
+        dst->name[1] = 0; \
+    } while (0)
+
+#define update_bit_1d(n1d, name) \
+    for (int i = 0; i < (n1d); i++) update_bit_0d(name[i])
+#define update_bit_2d(n1d, n2d, name) \
+    for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j])
+#define update_bit_3d(n1d, n2d, n3d, name) \
+    for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k])
+
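+    /* Each CDF stores its adaptation counter right after the probability
+     * entries; the update copies the whole adapted array from src, then
+     * zeroes that counter so adaptation restarts, e.g. update_cdf_1d(4,
+     * m.filter_intra) copies m.filter_intra and resets entry [4]. Some
+     * count arguments below intentionally reference the loop counters of
+     * the expanding macro (e.g. "!k" or "k + 1"); this works because the
+     * macros expand textually inside those loops. */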
+    update_bit_1d(N_BS_SIZES, m.use_filter_intra);
+    update_cdf_1d(4, m.filter_intra);
+    update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
+    update_cdf_2d(8, 6, m.angle_delta);
+    update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
+    update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
+    update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
+    update_bit_1d(3, m.skip);
+    update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition);
+    update_bit_2d(N_TX_SIZES, 13, coef.skip);
+    update_cdf_3d(2, 2, 4, coef.eob_bin_16);
+    update_cdf_3d(2, 2, 5, coef.eob_bin_32);
+    update_cdf_3d(2, 2, 6, coef.eob_bin_64);
+    update_cdf_3d(2, 2, 7, coef.eob_bin_128);
+    update_cdf_3d(2, 2, 8, coef.eob_bin_256);
+    update_cdf_2d(2, 9, coef.eob_bin_512);
+    update_cdf_2d(2, 10, coef.eob_bin_1024);
+    update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
+    update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
+    update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
+    update_bit_2d(2, 3, coef.dc_sign);
+    update_cdf_4d(4, 2, 21, 3, coef.br_tok);
+    update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
+    update_cdf_1d(7, m.cfl_sign);
+    update_cdf_2d(6, 15, m.cfl_alpha);
+    update_bit_0d(m.restore_wiener);
+    update_bit_0d(m.restore_sgrproj);
+    update_cdf_1d(2, m.restore_switchable);
+    update_cdf_1d(3, m.delta_q);
+    update_cdf_2d(5, 3, m.delta_lf);
+    update_bit_2d(7, 3, m.pal_y);
+    update_bit_1d(2, m.pal_uv);
+    update_cdf_3d(2, 7, 6, m.pal_sz);
+    update_cdf_4d(2, 7, 5, k + 1, m.color_map);
+    update_bit_2d(7, 3, m.txpart);
+    update_cdf_2d(2, 15, m.txtp_inter1);
+    update_cdf_1d(11, m.txtp_inter2);
+    update_bit_1d(4, m.txtp_inter3);
+
+    if (!(hdr->frame_type & 1)) {
+        update_bit_0d(m.intrabc);
+
+        update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
+        for (int k = 0; k < 2; k++) {
+            update_cdf_1d(10, dmv.comp[k].classes);
+            update_bit_0d(dmv.comp[k].class0);
+            update_bit_1d(10, dmv.comp[k].classN);
+            update_bit_0d(dmv.comp[k].sign);
+        }
+        return;
+    }
+
+    update_bit_1d(3, m.skip_mode);
+    update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
+    update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
+    update_bit_1d(6, m.newmv_mode);
+    update_bit_1d(2, m.globalmv_mode);
+    update_bit_1d(6, m.refmv_mode);
+    update_bit_1d(3, m.drl_bit);
+    update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
+    update_bit_1d(4, m.intra);
+    update_bit_1d(5, m.comp);
+    update_bit_1d(5, m.comp_dir);
+    update_bit_1d(6, m.jnt_comp);
+    update_bit_1d(6, m.mask_comp);
+    update_bit_1d(9, m.wedge_comp);
+    update_cdf_2d(9, 15, m.wedge_idx);
+    update_bit_2d(6, 3, m.ref);
+    update_bit_2d(3, 3, m.comp_fwd_ref);
+    update_bit_2d(2, 3, m.comp_bwd_ref);
+    update_bit_2d(3, 3, m.comp_uni_ref);
+    update_bit_1d(3, m.seg_pred);
+    update_bit_1d(4, m.interintra);
+    update_bit_1d(7, m.interintra_wedge);
+    update_cdf_2d(4, 3, m.interintra_mode);
+    update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
+    update_bit_1d(N_BS_SIZES, m.obmc);
+
+    update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
+    for (int k = 0; k < 2; k++) {
+        update_cdf_1d(10, mv.comp[k].classes);
+        update_bit_0d(mv.comp[k].class0);
+        update_bit_1d(10, mv.comp[k].classN);
+        update_cdf_2d(2, 3, mv.comp[k].class0_fp);
+        update_cdf_1d(3, mv.comp[k].classN_fp);
+        update_bit_0d(mv.comp[k].class0_hp);
+        update_bit_0d(mv.comp[k].classN_hp);
+        update_bit_0d(mv.comp[k].sign);
+    }
+}
+
+/*
+ * CDF threading wrappers.
+ */
+static inline int get_qcat_idx(const int q) {
+    if (q <= 20) return 0;
+    if (q <= 60) return 1;
+    if (q <= 120) return 2;
+    return 3;
+}
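+/* The category picks one of the four pre-baked coefficient CDF table
+ * sets (av1_default_coef_cdf[0..3]) in dav1d_cdf_thread_copy() below,
+ * e.g. a base quantizer index of 50 lands in the 21..60 bucket and
+ * yields category 1. */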
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) {
+    cdf->ref = NULL;
+    cdf->data.qcat = get_qcat_idx(qidx);
+}
+
+void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
+    if (src->ref) {
+        memcpy(dst, src->data.cdf, sizeof(*dst));
+    } else {
+        dst->m = av1_default_cdf;
+        memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf));
+        dst->coef = av1_default_coef_cdf[src->data.qcat];
+        memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+        memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+        dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] =
+            default_mv_component_cdf;
+    }
+}
+
+int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf,
+                           struct thread_data *const t)
+{
+    cdf->ref = dav1d_ref_create(sizeof(CdfContext) +
+                                (t != NULL) * sizeof(atomic_uint));
+    if (!cdf->ref) return DAV1D_ERR(ENOMEM);
+    cdf->data.cdf = cdf->ref->data;
+    if (t) {
+        cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
+        atomic_init(cdf->progress, 0);
+        cdf->t = t;
+    }
+    return 0;
+}
+
+void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
+                          CdfThreadContext *const src)
+{
+    *dst = *src;
+    if (src->ref)
+        dav1d_ref_inc(src->ref);
+}
+
+void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
+    if (cdf->ref)
+        dav1d_ref_dec(&cdf->ref);
+    memset(cdf, 0, sizeof(*cdf));
+}
+
+void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) {
+    if (!cdf->t) return;
+
+    if (atomic_load(cdf->progress)) return;
+    pthread_mutex_lock(&cdf->t->lock);
+    while (!atomic_load(cdf->progress))
+        pthread_cond_wait(&cdf->t->cond, &cdf->t->lock);
+    pthread_mutex_unlock(&cdf->t->lock);
+}
+
+void dav1d_cdf_thread_signal(CdfThreadContext *const cdf) {
+    if (!cdf->t) return;
+
+    pthread_mutex_lock(&cdf->t->lock);
+    atomic_store(cdf->progress, 1);
+    pthread_cond_broadcast(&cdf->t->cond);
+    pthread_mutex_unlock(&cdf->t->lock);
+}
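+
+/* dav1d_cdf_thread_wait() is double-checked: the lock-free atomic_load()
+ * fast path handles the common case where the producer already signalled,
+ * and only otherwise does it block on the condition variable. The signal
+ * side stores the flag while holding the same lock before broadcasting,
+ * so a waiter cannot miss the wakeup. With no thread_data attached
+ * (cdf->t == NULL), both functions return immediately. */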
diff --git a/src/cdf.h b/src/cdf.h
new file mode 100644 (file)
index 0000000..6d1c29a
--- /dev/null
+++ b/src/cdf.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDF_H
+#define DAV1D_SRC_CDF_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+#include "src/ref.h"
+#include "src/thread_data.h"
+
+/* Buffers padded to [8] or [16] for SIMD where needed. */
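+/* The "[n + m]" element counts below document n used entries plus m
+ * padding entries, e.g. txtp_inter2[12 + 4] is 12 symbols padded to 16
+ * uint16_t so that aligned SIMD loads stay in bounds. */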
+
+typedef struct CdfModeContext {
+    ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+    ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
+    ALIGN(uint16_t wedge_idx[9][16], 32);
+    ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
+    ALIGN(uint16_t cfl_alpha[6][16], 32);
+    ALIGN(uint16_t txtp_inter1[2][16], 32);
+    ALIGN(uint16_t txtp_inter2[12 + 4], 32);
+    ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
+    ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
+    ALIGN(uint16_t cfl_sign[8], 16);
+    ALIGN(uint16_t angle_delta[8][8], 16);
+    ALIGN(uint16_t filter_intra[5 + 3], 16);
+    ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+    ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
+    ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
+    ALIGN(uint16_t color_map[2][7][5][8], 16);
+    ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+    ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
+    ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+    ALIGN(uint16_t delta_q[4], 8);
+    ALIGN(uint16_t delta_lf[5][4], 8);
+    ALIGN(uint16_t interintra_mode[4][4], 8);
+    ALIGN(uint16_t restore_switchable[3 + 1], 8);
+    ALIGN(uint16_t restore_wiener[2], 4);
+    ALIGN(uint16_t restore_sgrproj[2], 4);
+    ALIGN(uint16_t interintra[7][2], 4);
+    ALIGN(uint16_t interintra_wedge[7][2], 4);
+    ALIGN(uint16_t txtp_inter3[4][2], 4);
+    ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+    ALIGN(uint16_t newmv_mode[6][2], 4);
+    ALIGN(uint16_t globalmv_mode[2][2], 4);
+    ALIGN(uint16_t refmv_mode[6][2], 4);
+    ALIGN(uint16_t drl_bit[3][2], 4);
+    ALIGN(uint16_t intra[4][2], 4);
+    ALIGN(uint16_t comp[5][2], 4);
+    ALIGN(uint16_t comp_dir[5][2], 4);
+    ALIGN(uint16_t jnt_comp[6][2], 4);
+    ALIGN(uint16_t mask_comp[6][2], 4);
+    ALIGN(uint16_t wedge_comp[9][2], 4);
+    ALIGN(uint16_t ref[6][3][2], 4);
+    ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
+    ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
+    ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
+    ALIGN(uint16_t txpart[7][3][2], 4);
+    ALIGN(uint16_t skip[3][2], 4);
+    ALIGN(uint16_t skip_mode[3][2], 4);
+    ALIGN(uint16_t seg_pred[3][2], 4);
+    ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
+    ALIGN(uint16_t pal_y[7][3][2], 4);
+    ALIGN(uint16_t pal_uv[2][2], 4);
+    ALIGN(uint16_t intrabc[2], 4);
+} CdfModeContext;
+
+typedef struct CdfCoefContext {
+    ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
+    ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
+    ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
+    ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
+    ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
+    ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
+    ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
+    ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
+    ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
+    ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
+    ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
+    ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
+    ALIGN(uint16_t dc_sign[2][3][2], 4);
+} CdfCoefContext;
+
+typedef struct CdfMvComponent {
+    ALIGN(uint16_t classes[11 + 5], 32);
+    ALIGN(uint16_t class0_fp[2][4], 8);
+    ALIGN(uint16_t classN_fp[4], 8);
+    ALIGN(uint16_t class0_hp[2], 4);
+    ALIGN(uint16_t classN_hp[2], 4);
+    ALIGN(uint16_t class0[2], 4);
+    ALIGN(uint16_t classN[10][2], 4);
+    ALIGN(uint16_t sign[2], 4);
+} CdfMvComponent;
+
+typedef struct CdfMvContext {
+    CdfMvComponent comp[2];
+    ALIGN(uint16_t joint[N_MV_JOINTS], 8);
+} CdfMvContext;
+
+typedef struct CdfContext {
+    CdfModeContext m;
+    ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
+    CdfCoefContext coef;
+    CdfMvContext mv, dmv;
+} CdfContext;
+
+typedef struct CdfThreadContext {
+    Dav1dRef *ref; ///< allocation origin
+    union {
+        CdfContext *cdf; // if ref != NULL
+        unsigned qcat; // if ref == NULL, from static CDF tables
+    } data;
+    struct thread_data *t;
+    atomic_uint *progress;
+} CdfThreadContext;
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+int dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t);
+void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
+void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
+void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
+                             const CdfContext *src);
+
+/*
+ * These are binary signals (so a signal is either "done" or "not done").
+ */
+void dav1d_cdf_thread_wait(CdfThreadContext *cdf);
+void dav1d_cdf_thread_signal(CdfThreadContext *cdf);
+
+#endif /* DAV1D_SRC_CDF_H */
diff --git a/src/cpu.c b/src/cpu.c
new file mode 100644 (file)
index 0000000..f8a909f
--- /dev/null
+++ b/src/cpu.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/cpu.h"
+
+static unsigned flags = 0;
+
+#if __has_feature(memory_sanitizer)
+// memory sanitizer is inherently incompatible with asm
+static unsigned flags_mask = 0;
+#elif ARCH_X86
+/* Disable AVX-512 by default for the time being */
+static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
+#else
+static unsigned flags_mask = -1;
+#endif
+
+COLD void dav1d_init_cpu(void) {
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_PPC64LE
+    flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_X86
+    flags = dav1d_get_cpu_flags_x86();
+#endif
+#endif
+}
+
+COLD unsigned dav1d_get_cpu_flags(void) {
+    return flags & flags_mask;
+}
+
+COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
+    flags_mask = mask;
+}
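+
+/* The mask allows detected features to be hidden at runtime: e.g. after
+ * dav1d_set_cpu_flags_mask(0), dav1d_get_cpu_flags() returns 0 and only
+ * the C reference code paths are selected, which is useful for testing
+ * the asm against the C implementations. */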
diff --git a/src/cpu.h b/src/cpu.h
new file mode 100644 (file)
index 0000000..d5299f2
--- /dev/null
+++ b/src/cpu.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CPU_H
+#define DAV1D_SRC_CPU_H
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "dav1d/common.h"
+
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cpu.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cpu.h"
+#elif ARCH_X86
+#include "src/x86/cpu.h"
+#endif
+
+void dav1d_init_cpu(void);
+unsigned dav1d_get_cpu_flags(void);
+DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
+
+#endif /* DAV1D_SRC_CPU_H */
diff --git a/src/ctx.h b/src/ctx.h
new file mode 100644 (file)
index 0000000..d0e1f31
--- /dev/null
+++ b/src/ctx.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CTX_H
+#define DAV1D_SRC_CTX_H
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
+union alias8 { uint8_t u8; } ATTR_ALIAS;
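+
+/* ATTR_ALIAS (a may_alias attribute on supporting compilers) keeps the
+ * type-punned stores below well-defined: the macros write 2, 4 or 8
+ * bytes at a time into arrays declared as uint8_t. */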
+
+#define set_ctx_rep4(type, var, off, val) do { \
+        const uint64_t const_val = val; \
+        ((union alias64 *) &var[off +  0])->u64 = const_val; \
+        ((union alias64 *) &var[off +  8])->u64 = const_val; \
+        ((union alias64 *) &var[off + 16])->u64 = const_val; \
+        ((union alias64 *) &var[off + 24])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep2(type, var, off, val) do { \
+        const uint64_t const_val = val; \
+        ((union alias64 *) &var[off + 0])->u64 = const_val; \
+        ((union alias64 *) &var[off + 8])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep1(typesz, var, off, val) \
+    ((union alias##typesz *) &var[off])->u##typesz = val
+#define case_set(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    }
+#define case_set_upto16(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    }
+#define case_set_upto32_with_default(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
+#define case_set_upto16_with_default(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
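+
+/* Usage sketch, mirroring read_tx_tree() in src/decode.c: the caller
+ * defines set_ctx() to describe the store, then dispatches on the run
+ * length:
+ *
+ *   #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ *       rep_macro(type, t->dir tx, off, mul * txw)
+ *   case_set_upto16(t_dim->w, a->, 0, bx4);
+ *   #undef set_ctx
+ *
+ * For a run length of 4 this expands to one 32-bit store of
+ * 0x01010101U * txw, i.e. the byte value txw replicated 4 times. */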
+
+#endif /* DAV1D_SRC_CTX_H */
diff --git a/src/data.c b/src/data.c
new file mode 100644 (file)
index 0000000..29e83ea
--- /dev/null
+++ b/src/data.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dav1d/data.h"
+
+#include "common/attributes.h"
+#include "common/validate.h"
+
+#include "src/data.h"
+#include "src/ref.h"
+
+uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
+    validate_input_or_ret(buf != NULL, NULL);
+
+    buf->ref = dav1d_ref_create(sz);
+    if (!buf->ref) return NULL;
+    buf->data = buf->ref->const_data;
+    buf->sz = buf->m.size = sz;
+    dav1d_data_props_set_defaults(&buf->m);
+
+    return buf->ref->data;
+}
+
+int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
+                             const size_t sz,
+                             void (*const free_callback)(const uint8_t *data,
+                                                         void *cookie),
+                             void *const cookie)
+{
+    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+    buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
+    if (!buf->ref) return DAV1D_ERR(ENOMEM);
+    buf->data = ptr;
+    buf->sz = buf->m.size = sz;
+    dav1d_data_props_set_defaults(&buf->m);
+
+    return 0;
+}
+
+int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
+                                       const uint8_t *const user_data,
+                                       void (*const free_callback)(const uint8_t *user_data,
+                                                                   void *cookie),
+                                       void *const cookie)
+{
+    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+    buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
+    if (!buf->m.user_data.ref) return DAV1D_ERR(ENOMEM);
+    buf->m.user_data.data = user_data;
+
+    return 0;
+}
+
+
+void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref) {
+        validate_input(src->data != NULL);
+        dav1d_ref_inc(src->ref);
+    }
+    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+    *dst = *src;
+}
+
+void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref)
+        validate_input(src->data != NULL);
+
+    *dst = *src;
+    memset(src, 0, sizeof(*src));
+}
+
+void dav1d_data_props_copy(Dav1dDataProps *const dst,
+                           const Dav1dDataProps *const src)
+{
+    assert(dst != NULL);
+    assert(src != NULL);
+
+    dav1d_ref_dec(&dst->user_data.ref);
+    *dst = *src;
+    if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
+}
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
+    assert(props != NULL);
+
+    props->timestamp = INT64_MIN;
+    props->duration = 0;
+    props->offset = -1;
+    props->user_data.data = NULL;
+    props->user_data.ref = NULL;
+}
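+
+/* The defaults above are sentinels: timestamp INT64_MIN, duration 0 and
+ * offset -1 mark the metadata as unset until the calling application
+ * fills in real values for its input. */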
+
+void dav1d_data_unref_internal(Dav1dData *const buf) {
+    validate_input(buf != NULL);
+
+    struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
+    if (buf->ref) {
+        validate_input(buf->data != NULL);
+        dav1d_ref_dec(&buf->ref);
+    }
+    memset(buf, 0, sizeof(*buf));
+    dav1d_ref_dec(&user_data_ref);
+}
diff --git a/src/data.h b/src/data.h
new file mode 100644 (file)
index 0000000..6ebb551
--- /dev/null
+++ b/src/data.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DATA_H
+#define DAV1D_SRC_DATA_H
+
+#include "dav1d/data.h"
+
+void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
+
+/**
+ * Move a data reference.
+ */
+void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
+
+/**
+ * Copy the source properties to the destination and increase the
+ * user_data's reference count (if it's not NULL).
+ */
+void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *props);
+
+uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
+int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
+                             void (*free_callback)(const uint8_t *data,
+                                                   void *user_data),
+                             void *user_data);
+int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
+                                       const uint8_t *user_data,
+                                       void (*free_callback)(const uint8_t *user_data,
+                                                             void *cookie),
+                                       void *cookie);
+void dav1d_data_unref_internal(Dav1dData *buf);
+
+#endif /* DAV1D_SRC_DATA_H */
diff --git a/src/dav1d.rc.in b/src/dav1d.rc.in
new file mode 100644 (file)
index 0000000..ad6aab4
--- /dev/null
+++ b/src/dav1d.rc.in
@@ -0,0 +1,32 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 VERSIONINFO
+FILETYPE VFT_DLL
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+  BLOCK "StringFileInfo"
+  BEGIN
+    BLOCK "040904E4"
+    BEGIN
+      VALUE "CompanyName", "VideoLAN"
+      VALUE "ProductName", "dav1d"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+      VALUE "InternalName", "dav1d"
+      VALUE "OriginalFilename", "libdav1d.dll"
+      VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+    END
+  END
+  BLOCK "VarFileInfo"
+  BEGIN
+    VALUE "Translation", 0x409, 1252
+  END
+END
diff --git a/src/decode.c b/src/decode.c
new file mode 100644 (file)
index 0000000..f678215
--- /dev/null
+++ b/src/decode.c
@@ -0,0 +1,3637 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "dav1d/data.h"
+
+#include "common/intops.h"
+#include "common/mem.h"
+
+#include "src/ctx.h"
+#include "src/decode.h"
+#include "src/dequant_tables.h"
+#include "src/env.h"
+#include "src/film_grain.h"
+#include "src/log.h"
+#include "src/qm.h"
+#include "src/recon.h"
+#include "src/ref.h"
+#include "src/tables.h"
+#include "src/thread_task.h"
+#include "src/warpmv.h"
+
+static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
+                              const Dav1dFrameHeader *const frame_hdr,
+                              const int qidx, uint16_t (*dq)[3][2])
+{
+    for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
+        const int yac = frame_hdr->segmentation.enabled ?
+            iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
+        const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
+        const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
+        const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
+        const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
+        const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
+
+        dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
+        dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
+        dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
+        dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
+        dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
+        dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
+    }
+}
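+
+/* Worked example: with qidx = 100 and quant.ydc_delta = -20, the luma DC
+ * index is iclip_u8(100 - 20) = 80 and dq[i][0][0] becomes
+ * dav1d_dq_tbl[hbd][80][0]. When segmentation is enabled, the per-segment
+ * delta_q first shifts yac the same way. */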
+
+static int read_mv_component_diff(Dav1dTileContext *const t,
+                                  CdfMvComponent *const mv_comp,
+                                  const int have_fp)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const int have_hp = f->frame_hdr->hp;
+    const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
+    const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                                    mv_comp->classes, 10);
+    int up, fp, hp;
+
+    if (!cl) {
+        up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
+        if (have_fp) {
+            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                 mv_comp->class0_fp[up], 3);
+            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                        mv_comp->class0_hp) : 1;
+        } else {
+            fp = 3;
+            hp = 1;
+        }
+    } else {
+        up = 1 << cl;
+        for (int n = 0; n < cl; n++)
+            up |= dav1d_msac_decode_bool_adapt(&ts->msac,
+                                               mv_comp->classN[n]) << n;
+        if (have_fp) {
+            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                 mv_comp->classN_fp, 3);
+            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                        mv_comp->classN_hp) : 1;
+        } else {
+            fp = 3;
+            hp = 1;
+        }
+    }
+
+    const int diff = ((up << 3) | (fp << 1) | hp) + 1;
+
+    return sign ? -diff : diff;
+}
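+
+/* The magnitude is composed in 1/8-pel units: up is the integer part,
+ * fp the 1/4-pel fraction (2 bits) and hp the 1/8-pel bit. E.g. up = 1,
+ * fp = 2, hp = 0 gives ((1 << 3) | (2 << 1) | 0) + 1 = 13, i.e. 13/8 pel
+ * before the sign is applied. */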
+
+static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
+                             CdfMvContext *const mv_cdf, const int have_fp)
+{
+    switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
+                                            N_MV_JOINTS - 1))
+    {
+    case MV_JOINT_HV:
+        ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+        ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+        break;
+    case MV_JOINT_H:
+        ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+        break;
+    case MV_JOINT_V:
+        ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+        break;
+    default:
+        break;
+    }
+}
+
+static void read_tx_tree(Dav1dTileContext *const t,
+                         const enum RectTxfmSize from,
+                         const int depth, uint16_t *const masks,
+                         const int x_off, const int y_off)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+    const int txw = t_dim->lw, txh = t_dim->lh;
+    int is_split;
+
+    if (depth < 2 && from > (int) TX_4X4) {
+        const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
+        const int a = t->a->tx[bx4] < txw;
+        const int l = t->l.tx[by4] < txh;
+
+        is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
+                       t->ts->cdf.m.txpart[cat][a + l]);
+        if (is_split)
+            masks[depth] |= 1 << (y_off * 4 + x_off);
+    } else {
+        is_split = 0;
+    }
+
+    if (is_split && t_dim->max > TX_8X8) {
+        const enum RectTxfmSize sub = t_dim->sub;
+        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+        read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0);
+        t->bx += txsw;
+        if (txw >= txh && t->bx < f->bw)
+            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0);
+        t->bx -= txsw;
+        t->by += txsh;
+        if (txh >= txw && t->by < f->bh) {
+            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1);
+            t->bx += txsw;
+            if (txw >= txh && t->bx < f->bw)
+                read_tx_tree(t, sub, depth + 1, masks,
+                             x_off * 2 + 1, y_off * 2 + 1);
+            t->bx -= txsw;
+        }
+        t->by -= txsh;
+    } else {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+        case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+        case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
+    }
+}
+
+static int neg_deinterleave(int diff, int ref, int max) {
+    if (!ref) return diff;
+    if (ref >= (max - 1)) return max - diff - 1;
+    if (2 * ref < max) {
+        if (diff <= 2 * ref) {
+            if (diff & 1)
+                return ref + ((diff + 1) >> 1);
+            else
+                return ref - (diff >> 1);
+        }
+        return diff;
+    } else {
+        if (diff <= 2 * (max - ref - 1)) {
+            if (diff & 1)
+                return ref + ((diff + 1) >> 1);
+            else
+                return ref - (diff >> 1);
+        }
+        return max - (diff + 1);
+    }
+}
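+
+/* Inverse of the encoder's interleaving of signed deltas around a
+ * predicted value: codes alternate above and below ref until one side
+ * is exhausted. E.g. with ref = 3 and max = 8, the codes 0..7 decode to
+ * 3, 4, 2, 5, 1, 6, 0, 7. */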
+
+static void find_matching_ref(const Dav1dTileContext *const t,
+                              const enum EdgeFlags intra_edge_flags,
+                              const int bw4, const int bh4,
+                              const int w4, const int h4,
+                              const int have_left, const int have_top,
+                              const int ref, uint64_t masks[2])
+{
+    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+    int count = 0;
+    int have_topleft = have_top && have_left;
+    int have_topright = imax(bw4, bh4) < 32 &&
+                        have_top && t->bx + bw4 < t->ts->tiling.col_end &&
+                        (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+#define bs(rp) dav1d_block_dimensions[(rp)->bs]
+#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
+
+    if (have_top) {
+        const refmvs_block *r2 = &r[-1][t->bx];
+        if (matches(r2)) {
+            masks[0] |= 1;
+            count = 1;
+        }
+        int aw4 = bs(r2)[0];
+        if (aw4 >= bw4) {
+            const int off = t->bx & (aw4 - 1);
+            if (off) have_topleft = 0;
+            if (aw4 - off > bw4) have_topright = 0;
+        } else {
+            unsigned mask = 1 << aw4;
+            for (int x = aw4; x < w4; x += aw4) {
+                r2 += aw4;
+                if (matches(r2)) {
+                    masks[0] |= mask;
+                    if (++count >= 8) return;
+                }
+                aw4 = bs(r2)[0];
+                mask <<= aw4;
+            }
+        }
+    }
+    if (have_left) {
+        /*const*/ refmvs_block *const *r2 = r;
+        if (matches(&r2[0][t->bx - 1])) {
+            masks[1] |= 1;
+            if (++count >= 8) return;
+        }
+        int lh4 = bs(&r2[0][t->bx - 1])[1];
+        if (lh4 >= bh4) {
+            if (t->by & (lh4 - 1)) have_topleft = 0;
+        } else {
+            unsigned mask = 1 << lh4;
+            for (int y = lh4; y < h4; y += lh4) {
+                r2 += lh4;
+                if (matches(&r2[0][t->bx - 1])) {
+                    masks[1] |= mask;
+                    if (++count >= 8) return;
+                }
+                lh4 = bs(&r2[0][t->bx - 1])[1];
+                mask <<= lh4;
+            }
+        }
+    }
+    if (have_topleft && matches(&r[-1][t->bx - 1])) {
+        masks[1] |= 1ULL << 32;
+        if (++count >= 8) return;
+    }
+    if (have_topright && matches(&r[-1][t->bx + bw4])) {
+        masks[0] |= 1ULL << 32;
+    }
+#undef matches
+}
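+
+/* On return, masks[0] marks matching single-ref neighbour blocks along
+ * the top edge and masks[1] those along the left edge, one bit per block
+ * start in 4-sample units; bit 32 flags the top-right resp. top-left
+ * corner. derive_warpmv() below walks these masks to gather up to 8
+ * candidate motion samples. */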
+
+static void derive_warpmv(const Dav1dTileContext *const t,
+                          const int bw4, const int bh4,
+                          const uint64_t masks[2], const union mv mv,
+                          Dav1dWarpedMotionParams *const wmp)
+{
+    int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
+    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+
+#define add_sample(dx, dy, sx, sy, rp) do { \
+    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
+    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
+    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
+    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
+    np++; \
+} while (0)
+
+    // use masks[] to find the projectable motion vectors in the edges
+    if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
+        const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
+        add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
+    } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
+        const int tz = ctz(xmask);
+        off += tz;
+        xmask >>= tz;
+        add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
+        xmask &= ~1;
+    }
+    if (np < 8 && masks[1] == 1) {
+        const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
+        add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
+    } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
+        const int tz = ctz(ymask);
+        off += tz;
+        ymask >>= tz;
+        add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
+        ymask &= ~1;
+    }
+    if (np < 8 && masks[1] >> 32) // top/left
+        add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
+    if (np < 8 && masks[0] >> 32) // top/right
+        add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
+    assert(np > 0 && np <= 8);
+#undef bs
+
+    // select according to motion vector difference against a threshold
+    int mvd[8], ret = 0;
+    const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
+    for (int i = 0; i < np; i++) {
+        mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
+                 abs(pts[i][1][1] - pts[i][0][1] - mv.y);
+        if (mvd[i] > thresh)
+            mvd[i] = -1;
+        else
+            ret++;
+    }
+    if (!ret) {
+        ret = 1;
+    } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
+        while (mvd[i] != -1) i++;
+        while (mvd[j] == -1) j--;
+        assert(i != j);
+        if (i > j) break;
+        // replace discarded samples with kept ones from the tail
+        mvd[i] = mvd[j];
+        memcpy(pts[i], pts[j], sizeof(*pts));
+    }
+
+    if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
+        !dav1d_get_shear_params(wmp))
+    {
+        wmp->type = DAV1D_WM_TYPE_AFFINE;
+    } else
+        wmp->type = DAV1D_WM_TYPE_IDENTITY;
+}
+
+static inline int findoddzero(const uint8_t *buf, int len) {
+    for (int n = 0; n < len; n++)
+        if (!buf[n * 2]) return 1;
+    return 0;
+}
+
+static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
+                           const int pl, const int sz_ctx,
+                           const int bx4, const int by4)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
+    uint16_t cache[16], used_cache[8];
+    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
+    int n_cache = 0;
+    // don't reuse above palette outside SB64 boundaries
+    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
+    const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
+
+    // fill/sort cache
+    while (l_cache && a_cache) {
+        if (*l < *a) {
+            if (!n_cache || cache[n_cache - 1] != *l)
+                cache[n_cache++] = *l;
+            l++;
+            l_cache--;
+        } else {
+            if (*a == *l) {
+                l++;
+                l_cache--;
+            }
+            if (!n_cache || cache[n_cache - 1] != *a)
+                cache[n_cache++] = *a;
+            a++;
+            a_cache--;
+        }
+    }
+    if (l_cache) {
+        do {
+            if (!n_cache || cache[n_cache - 1] != *l)
+                cache[n_cache++] = *l;
+            l++;
+        } while (--l_cache > 0);
+    } else if (a_cache) {
+        do {
+            if (!n_cache || cache[n_cache - 1] != *a)
+                cache[n_cache++] = *a;
+            a++;
+        } while (--a_cache > 0);
+    }
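+    // e.g. a left palette {2,5,9} and an above palette {2,7} merge into
+    // the sorted, deduplicated cache {2,5,7,9}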
+
+    // find reused cache entries
+    int i = 0;
+    for (int n = 0; n < n_cache && i < pal_sz; n++)
+        if (dav1d_msac_decode_bool_equi(&ts->msac))
+            used_cache[i++] = cache[n];
+    const int n_used_cache = i;
+
+    // parse new entries
+    uint16_t *const pal = f->frame_thread.pass ?
+        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                            ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
+    if (i < pal_sz) {
+        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+
+        if (i < pal_sz) {
+            int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
+            const int max = (1 << f->cur.p.bpc) - 1;
+
+            do {
+                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+                prev = pal[i++] = imin(prev + delta + !pl, max);
+                if (prev + !pl >= max) {
+                    for (; i < pal_sz; i++)
+                        pal[i] = max;
+                    break;
+                }
+                bits = imin(bits, 1 + ulog2(max - prev - !pl));
+            } while (i < pal_sz);
+        }
+
+        // merge cache+new entries
+        int n = 0, m = n_used_cache;
+        for (i = 0; i < pal_sz; i++) {
+            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
+                pal[i] = used_cache[n++];
+            } else {
+                assert(m < pal_sz);
+                pal[i] = pal[m++];
+            }
+        }
+    } else {
+        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
+    }
+
+    if (DEBUG_BLOCK_INFO) {
+        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
+               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
+        for (int n = 0; n < n_cache; n++)
+            printf("%c%02x", n ? ' ' : '[', cache[n]);
+        printf("%s, pal=", n_cache ? "]" : "[]");
+        for (int n = 0; n < pal_sz; n++)
+            printf("%c%02x", n ? ' ' : '[', pal[n]);
+        printf("]\n");
+    }
+}
+
+static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b,
+                        const int sz_ctx, const int bx4, const int by4)
+{
+    read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
+
+    // V pal coding
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    uint16_t *const pal = f->frame_thread.pass ?
+        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                            ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
+    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
+        const int bits = f->cur.p.bpc - 4 +
+                         dav1d_msac_decode_bools(&ts->msac, 2);
+        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+        const int max = (1 << f->cur.p.bpc) - 1;
+        for (int i = 1; i < b->pal_sz[1]; i++) {
+            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
+            prev = pal[i] = (prev + delta) & max;
+        }
+    } else {
+        for (int i = 0; i < b->pal_sz[1]; i++)
+            pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+    }
+    if (DEBUG_BLOCK_INFO) {
+        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
+        for (int n = 0; n < b->pal_sz[1]; n++)
+            printf("%c%02x", n ? ' ' : '[', pal[n]);
+        printf("]\n");
+    }
+}
+
+// meant to be SIMD'able: by handling a whole anti-diagonal per call, the
+// number of sequential steps drops from w4*h4 (per entry) to w4+h4-1
+// (per diagonal).
+// For each entry between columns first and last on diagonal i of the
+// palette index map, this reads the already-decoded top, left and top-left
+// neighbours (whichever exist) and derives a symbol context plus the
+// neighbour-dependent ordering of the 8 palette indices.
+// output is order[] and ctx for each member of this diagonal.
+static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
+                          const int i, const int first, const int last,
+                          uint8_t (*const order)[8], uint8_t *const ctx)
+{
+    int have_top = i > first;
+
+    assert(pal_idx);
+    pal_idx += first + (i - first) * stride;
+    for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
+        const int have_left = j > 0;
+
+        assert(have_left || have_top);
+
+#define add(v_in) do { \
+        const int v = v_in; \
+        assert((unsigned)v < 8U); \
+        order[n][o_idx++] = v; \
+        mask |= 1 << v; \
+    } while (0)
+
+        unsigned mask = 0;
+        int o_idx = 0;
+        if (!have_left) {
+            ctx[n] = 0;
+            add(pal_idx[-stride]);
+        } else if (!have_top) {
+            ctx[n] = 0;
+            add(pal_idx[-1]);
+        } else {
+            const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
+            const int same_t_l = t == l;
+            const int same_t_tl = t == tl;
+            const int same_l_tl = l == tl;
+            const int same_all = same_t_l & same_t_tl & same_l_tl;
+
+            if (same_all) {
+                ctx[n] = 4;
+                add(t);
+            } else if (same_t_l) {
+                ctx[n] = 3;
+                add(t);
+                add(tl);
+            } else if (same_t_tl | same_l_tl) {
+                ctx[n] = 2;
+                add(tl);
+                add(same_t_tl ? l : t);
+            } else {
+                ctx[n] = 1;
+                add(imin(t, l));
+                add(imax(t, l));
+                add(tl);
+            }
+        }
+        for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
+            if (!(mask & m))
+                order[n][o_idx++] = bit;
+        assert(o_idx == 8);
+#undef add
+    }
+}
+
+static void read_pal_indices(Dav1dTileContext *const t,
+                             uint8_t *const pal_idx,
+                             const Av1Block *const b, const int pl,
+                             const int w4, const int h4,
+                             const int bw4, const int bh4)
+{
+    Dav1dTileState *const ts = t->ts;
+    const ptrdiff_t stride = bw4 * 4;
+    assert(pal_idx);
+    pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+    uint16_t (*const color_map_cdf)[8] =
+        ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
+    uint8_t (*const order)[8] = t->scratch.pal_order;
+    uint8_t *const ctx = t->scratch.pal_ctx;
+    for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
+        // top/left-to-bottom/right diagonals ("wave-front")
+        const int first = imin(i, w4 * 4 - 1);
+        const int last = imax(0, i - h4 * 4 + 1);
+        order_palette(pal_idx, stride, i, first, last, order, ctx);
+        for (int j = first, m = 0; j >= last; j--, m++) {
+            const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                      color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
+            pal_idx[(i - j) * stride + j] = order[m][color_idx];
+        }
+    }
+    // fill invisible edges
+    if (bw4 > w4)
+        for (int y = 0; y < 4 * h4; y++)
+            memset(&pal_idx[y * stride + 4 * w4],
+                   pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
+    if (h4 < bh4) {
+        const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
+        for (int y = h4 * 4; y < bh4 * 4; y++)
+            memcpy(&pal_idx[y * stride], src, bw4 * 4);
+    }
+}
+
+static void read_vartx_tree(Dav1dTileContext *const t,
+                            Av1Block *const b, const enum BlockSize bs,
+                            const int bx4, const int by4)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+
+    // var-tx tree coding
+    uint16_t tx_split[2] = { 0 };
+    b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
+    if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
+                     b->max_ytx == TX_4X4))
+    {
+        b->max_ytx = b->uvtx = TX_4X4;
+        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, TX_4X4)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
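+            // (case_set(), from src/ctx.h, dispatches on the run length and
+            // set_ctx performs the stores, with mul being a byte-replication
+            // constant (0x01, 0x0101, 0x0101010101010101ULL, ...), so a run
+            // of above-/left-context bytes is filled with a few wide stores
+            // rather than a loop.)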
+        }
+    } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
+        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        }
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+    } else {
+        assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
+        int y, x, y_off, x_off;
+        const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+        for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
+            for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
+                read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
+                // contexts are updated inside read_tx_tree()
+                t->bx += ytx->w;
+            }
+            t->bx -= x;
+            t->by += ytx->h;
+        }
+        t->by -= y;
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-vartxtree[%x/%x]: r=%d\n",
+                   tx_split[0], tx_split[1], t->ts->msac.rng);
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+    }
+    assert(!(tx_split[0] & ~0x33));
+    b->tx_split0 = (uint8_t)tx_split[0];
+    b->tx_split1 = tx_split[1];
+}
+
+static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
+                                            const int by, const int bx,
+                                            const int w4, int h4,
+                                            const uint8_t *ref_seg_map,
+                                            const ptrdiff_t stride)
+{
+    assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+    if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
+                                  (by + h4) * 4, PLANE_TYPE_BLOCK))
+    {
+        return 8;
+    }
+
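+    // The prediction is the minimum previous-frame segment id over the
+    // co-located w4 x h4 area (cf. the spec's get_segment_id()), so the
+    // scan can stop early once it reaches 0.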
+    unsigned seg_id = 8;
+    ref_seg_map += by * stride + bx;
+    do {
+        for (int x = 0; x < w4; x++)
+            seg_id = imin(seg_id, ref_seg_map[x]);
+        ref_seg_map += stride;
+    } while (--h4 > 0 && seg_id);
+    assert(seg_id < 8);
+
+    return seg_id;
+}
+
+static int decode_b(Dav1dTileContext *const t,
+                    const enum BlockLevel bl,
+                    const enum BlockSize bs,
+                    const enum BlockPartition bp,
+                    const enum EdgeFlags intra_edge_flags)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    Av1Block b_mem, *const b = f->frame_thread.pass ?
+        &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+    const int have_left = t->bx > ts->tiling.col_start;
+    const int have_top = t->by > ts->tiling.row_start;
+    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
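+    // With subsampled chroma, a luma block only one 4-px unit wide/high in
+    // the subsampled dimension shares its chroma with a neighbour: only the
+    // block at the odd (second) position codes it, hence the parity checks.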
+
+    if (f->frame_thread.pass == 2) {
+        if (b->intra) {
+            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+
+            const enum IntraPredMode y_mode_nofilt =
+                b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+            rep_macro(type, t->dir intra, off, mul)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+            if (f->frame_hdr->frame_type & 1) {
+                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+                for (int x = 0; x < bw4; x++) {
+                    r[x].ref.ref[0] = 0;
+                    r[x].bs = bs;
+                }
+                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+                for (int y = 0; y < bh4 - 1; y++) {
+                    rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
+                    rr[y][t->bx + bw4 - 1].bs = bs;
+                }
+            }
+
+            if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+            }
+        } else {
+            if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
+                b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
+            {
+                if (b->matrix[0] == SHRT_MIN) {
+                    t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
+                } else {
+                    t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
+                    t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
+                    t->warpmv.matrix[3] = b->matrix[1];
+                    t->warpmv.matrix[4] = b->matrix[2];
+                    t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
+                    dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
+                                          t->bx, t->by);
+                    dav1d_get_shear_params(&t->warpmv);
+#define signabs(v) ((v) < 0 ? '-' : ' '), abs(v)
+                    if (DEBUG_BLOCK_INFO)
+                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
+                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
+                               signabs(t->warpmv.matrix[0]),
+                               signabs(t->warpmv.matrix[1]),
+                               signabs(t->warpmv.matrix[2]),
+                               signabs(t->warpmv.matrix[3]),
+                               signabs(t->warpmv.matrix[4]),
+                               signabs(t->warpmv.matrix[5]),
+                               signabs(t->warpmv.alpha),
+                               signabs(t->warpmv.beta),
+                               signabs(t->warpmv.gamma),
+                               signabs(t->warpmv.delta),
+                               b->mv2d.y, b->mv2d.x);
+#undef signabs
+                }
+            }
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+
+            const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+            rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+            rep_macro(type, t->dir intra, off, 0)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+            if (f->frame_hdr->frame_type & 1) {
+                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+                for (int x = 0; x < bw4; x++) {
+                    r[x].ref.ref[0] = b->ref[0] + 1;
+                    r[x].mv.mv[0] = b->mv[0];
+                    r[x].bs = bs;
+                }
+                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+                for (int y = 0; y < bh4 - 1; y++) {
+                    rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
+                    rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0];
+                    rr[y][t->bx + bw4 - 1].bs = bs;
+                }
+            }
+
+            if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+            }
+        }
+        return 0;
+    }
+
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+    b->bl = bl;
+    b->bp = bp;
+    b->bs = bs;
+
+    const Dav1dSegmentationData *seg = NULL;
+
+    // segment_id (if seg_feature for skip/ref/gmv is enabled)
+    int seg_pred = 0;
+    if (f->frame_hdr->segmentation.enabled) {
+        if (!f->frame_hdr->segmentation.update_map) {
+            if (f->prev_segmap) {
+                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+                                                       f->prev_segmap,
+                                                       f->b4_stride);
+                if (seg_id >= 8) return -1;
+                b->seg_id = seg_id;
+            } else {
+                b->seg_id = 0;
+            }
+            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+        } else if (f->frame_hdr->segmentation.seg_data.preskip) {
+            if (f->frame_hdr->segmentation.temporal &&
+                (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+                                t->l.seg_pred[by4]])))
+            {
+                // temporally predicted seg_id
+                if (f->prev_segmap) {
+                    unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
+                                                           w4, h4,
+                                                           f->prev_segmap,
+                                                           f->b4_stride);
+                    if (seg_id >= 8) return -1;
+                    b->seg_id = seg_id;
+                } else {
+                    b->seg_id = 0;
+                }
+            } else {
+                int seg_ctx;
+                const unsigned pred_seg_id =
+                    get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+                                        &seg_ctx, f->cur_segmap, f->b4_stride);
+                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                          ts->cdf.m.seg_id[seg_ctx],
+                                          DAV1D_MAX_SEGMENTS - 1);
+                const unsigned last_active_seg_id =
+                    f->frame_hdr->segmentation.seg_data.last_active_segid;
+                b->seg_id = neg_deinterleave(diff, pred_seg_id,
+                                             last_active_seg_id + 1);
+                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+                if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
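+                // (neg_deinterleave() undoes a zig-zag mapping: diff counts
+                // outward from pred_seg_id, alternating below and above it,
+                // so ids near the prediction get the smallest symbols.)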
+            }
+
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-segid[preskip;%d]: r=%d\n",
+                       b->seg_id, ts->msac.rng);
+
+            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+        }
+    } else {
+        b->seg_id = 0;
+    }
+
+    // skip_mode
+    if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
+        f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
+    {
+        const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
+        b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
+                           ts->cdf.m.skip_mode[smctx]);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
+    } else {
+        b->skip_mode = 0;
+    }
+
+    // skip
+    if (b->skip_mode || (seg && seg->skip)) {
+        b->skip = 1;
+    } else {
+        const int sctx = t->a->skip[bx4] + t->l.skip[by4];
+        b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
+    }
+
+    // segment_id
+    if (f->frame_hdr->segmentation.enabled &&
+        f->frame_hdr->segmentation.update_map &&
+        !f->frame_hdr->segmentation.seg_data.preskip)
+    {
+        if (!b->skip && f->frame_hdr->segmentation.temporal &&
+            (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+                            t->l.seg_pred[by4]])))
+        {
+            // temporally predicted seg_id
+            if (f->prev_segmap) {
+                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+                                                       f->prev_segmap,
+                                                       f->b4_stride);
+                if (seg_id >= 8) return -1;
+                b->seg_id = seg_id;
+            } else {
+                b->seg_id = 0;
+            }
+        } else {
+            int seg_ctx;
+            const unsigned pred_seg_id =
+                get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+                                    &seg_ctx, f->cur_segmap, f->b4_stride);
+            if (b->skip) {
+                b->seg_id = pred_seg_id;
+            } else {
+                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                          ts->cdf.m.seg_id[seg_ctx],
+                                          DAV1D_MAX_SEGMENTS - 1);
+                const unsigned last_active_seg_id =
+                    f->frame_hdr->segmentation.seg_data.last_active_segid;
+                b->seg_id = neg_deinterleave(diff, pred_seg_id,
+                                             last_active_seg_id + 1);
+                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+            }
+            if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+        }
+
+        seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-segid[postskip;%d]: r=%d\n",
+                   b->seg_id, ts->msac.rng);
+    }
+
+    // cdef index
+    if (!b->skip) {
+        const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
+                                           ((t->by & 16) >> 3) : 0;
+        if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
+            const int v = dav1d_msac_decode_bools(&ts->msac,
+                              f->frame_hdr->cdef.n_bits);
+            t->cur_sb_cdef_idx_ptr[idx] = v;
+            if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
+            if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
+            if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
+
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-cdef_idx[%d]: r=%d\n",
+                        *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
+        }
+    }
+
+    // delta-q/lf
+    if (!(t->bx & (31 >> !f->seq_hdr->sb128)) &&
+        !(t->by & (31 >> !f->seq_hdr->sb128)))
+    {
+        const int prev_qidx = ts->last_qidx;
+        const int have_delta_q = f->frame_hdr->delta.q.present &&
+            (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
+
+        int8_t prev_delta_lf[4];
+        memcpy(prev_delta_lf, ts->last_delta_lf, 4);
+
+        if (have_delta_q) {
+            int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                          ts->cdf.m.delta_q, 3);
+            if (delta_q == 3) {
+                const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+                delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+                          1 + (1 << n_bits);
+            }
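+            // (Escape code: symbol 3 means |delta_q| exceeds 2; n_bits - 1
+            // is then coded with 3 bits and the remainder with n_bits bits,
+            // so |delta_q| lands in [(1 << n_bits) + 1, 2 << n_bits] --
+            // n_bits = 1 covers 3..4, n_bits = 2 covers 5..8, etc. The
+            // delta_lf loop below uses the same scheme.)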
+            if (delta_q) {
+                if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
+                delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
+            }
+            ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-delta_q[%d->%d]: r=%d\n",
+                       delta_q, ts->last_qidx, ts->msac.rng);
+
+            if (f->frame_hdr->delta.lf.present) {
+                const int n_lfs = f->frame_hdr->delta.lf.multi ?
+                    f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
+
+                for (int i = 0; i < n_lfs; i++) {
+                    int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                        ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
+                    if (delta_lf == 3) {
+                        const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+                        delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+                                   1 + (1 << n_bits);
+                    }
+                    if (delta_lf) {
+                        if (dav1d_msac_decode_bool_equi(&ts->msac))
+                            delta_lf = -delta_lf;
+                        delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
+                    }
+                    ts->last_delta_lf[i] =
+                        iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
+                               ts->msac.rng);
+                }
+            }
+        }
+        if (ts->last_qidx == f->frame_hdr->quant.yac) {
+            // assign frame-wide q values to this sb
+            ts->dq = f->dq;
+        } else if (ts->last_qidx != prev_qidx) {
+            // find sb-specific quant parameters
+            init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
+            ts->dq = ts->dqmem;
+        }
+        if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
+            // assign frame-wide lf values to this sb
+            ts->lflvl = f->lf.lvl;
+        } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
+            // find sb-specific lf lvl parameters
+            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
+            ts->lflvl = ts->lflvlmem;
+        }
+    }
+
+    if (b->skip_mode) {
+        b->intra = 0;
+    } else if (f->frame_hdr->frame_type & 1) {
+        if (seg && (seg->ref >= 0 || seg->globalmv)) {
+            b->intra = !seg->ref;
+        } else {
+            const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
+                                           have_top, have_left);
+            b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.intra[ictx]);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
+        }
+    } else if (f->frame_hdr->allow_intrabc) {
+        b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
+    } else {
+        b->intra = 1;
+    }
+
+    // intra/inter-specific stuff
+    if (b->intra) {
+        uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
+            ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
+            ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
+                        [dav1d_intra_mode_context[t->l.mode[by4]]];
+        b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
+                                                     N_INTRA_PRED_MODES - 1);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
+
+        // angle delta
+        if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
+            b->y_mode <= VERT_LEFT_PRED)
+        {
+            uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
+            const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+            b->y_angle = angle - 3;
+        } else {
+            b->y_angle = 0;
+        }
+
+        if (has_chroma) {
+            const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
+                cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
+            uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
+            b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
+                             N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
+
+            if (b->uv_mode == CFL_PRED) {
+                const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                     ts->cdf.m.cfl_sign, 7) + 1;
+                const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
+                assert(sign_u == sign / 3);
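+                // (sign is the joint index 3 * sign_u + sign_v in 1..7,
+                // where 0/1/2 mean zero/negative/positive per plane and the
+                // all-zero pair is excluded. 0x56 / 256 ~= 1/3, so
+                // sign * 0x56 >> 8 equals sign / 3 over this range, as the
+                // assert checks.)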
+                if (sign_u) {
+                    const int ctx = (sign_u == 2) * 3 + sign_v;
+                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+                    if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
+                } else {
+                    b->cfl_alpha[0] = 0;
+                }
+                if (sign_v) {
+                    const int ctx = (sign_v == 2) * 3 + sign_u;
+                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+                    if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
+                } else {
+                    b->cfl_alpha[1] = 0;
+                }
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-uvalphas[%d/%d]: r=%d\n",
+                           b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
+            } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
+                       b->uv_mode <= VERT_LEFT_PRED)
+            {
+                uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
+                const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+                b->uv_angle = angle - 3;
+            } else {
+                b->uv_angle = 0;
+            }
+        }
+
+        b->pal_sz[0] = b->pal_sz[1] = 0;
+        if (f->frame_hdr->allow_screen_content_tools &&
+            imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
+        {
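+            // (bw4/bh4 are 4-px units: bw4 + bh4 >= 4 rules out 4x4, 4x8
+            // and 8x4, and imax(bw4, bh4) <= 16 rules out anything wider or
+            // taller than 64 px -- exactly the sizes for which the spec
+            // allows palette mode.)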
+            const int sz_ctx = b_dim[2] + b_dim[3] - 2;
+            if (b->y_mode == DC_PRED) {
+                const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
+                const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
+                if (use_y_pal)
+                    read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
+            }
+
+            if (has_chroma && b->uv_mode == DC_PRED) {
+                const int pal_ctx = b->pal_sz[0] > 0;
+                const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                           ts->cdf.m.pal_uv[pal_ctx]);
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
+                if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
+                    read_pal_uv(t, b, sz_ctx, bx4, by4);
+            }
+        }
+
+        if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
+            imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
+        {
+            const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.use_filter_intra[bs]);
+            if (is_filter) {
+                b->y_mode = FILTER_PRED;
+                b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                 ts->cdf.m.filter_intra, 4);
+            }
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-filterintramode[%d/%d]: r=%d\n",
+                       b->y_mode, b->y_angle, ts->msac.rng);
+        }
+
+        if (b->pal_sz[0]) {
+            uint8_t *pal_idx;
+            if (f->frame_thread.pass) {
+                assert(ts->frame_thread.pal_idx);
+                pal_idx = ts->frame_thread.pal_idx;
+                ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+            } else
+                pal_idx = t->scratch.pal_idx;
+            read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
+        }
+
+        if (has_chroma && b->pal_sz[1]) {
+            uint8_t *pal_idx;
+            if (f->frame_thread.pass) {
+                assert(ts->frame_thread.pal_idx);
+                pal_idx = ts->frame_thread.pal_idx;
+                ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+            } else
+                pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+            read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
+        }
+
+        const TxfmInfo *t_dim;
+        if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+            b->tx = b->uvtx = (int) TX_4X4;
+            t_dim = &dav1d_txfm_dimensions[TX_4X4];
+        } else {
+            b->tx = dav1d_max_txfm_size_for_bs[bs][0];
+            b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+            t_dim = &dav1d_txfm_dimensions[b->tx];
+            if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
+                const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
+                uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
+                int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
+                                imin(t_dim->max, 2));
+
+                while (depth--) {
+                    b->tx = t_dim->sub;
+                    t_dim = &dav1d_txfm_dimensions[b->tx];
+                }
+            }
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
+        }
+
+        // reconstruction
+        if (f->frame_thread.pass == 1) {
+            f->bd_fn.read_coef_blocks(t, bs, b);
+        } else {
+            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+        }
+
+        if (f->frame_hdr->loopfilter.level_y[0] ||
+            f->frame_hdr->loopfilter.level_y[1])
+        {
+            dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
+                                       (const uint8_t (*)[8][2])
+                                       &ts->lflvl[b->seg_id][0][0][0],
+                                       t->bx, t->by, f->w4, f->h4, bs,
+                                       b->tx, b->uvtx, f->cur.p.layout,
+                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        }
+
+        // update contexts
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+        rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+        rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+        rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, mul); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        /* see aomedia bug 2183 for why we use luma coordinates here */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+        if (f->frame_hdr->frame_type & 1) { \
+            rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
+            rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+            rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+        }
+        const enum IntraPredMode y_mode_nofilt =
+            b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (b->pal_sz[0]) {
+            uint16_t *const pal = f->frame_thread.pass ?
+                f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                    ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
+            for (int x = 0; x < bw4; x++)
+                memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
+            for (int y = 0; y < bh4; y++)
+                memcpy(t->al_pal[1][by4 + y][0], pal, 16);
+        }
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+            if (b->pal_sz[1]) {
+                const uint16_t (*const pal)[8] = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
+                    (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
+                    t->scratch.pal;
+                // see aomedia bug 2183 for why we use luma coordinates here
+                for (int pl = 1; pl <= 2; pl++) {
+                    for (int x = 0; x < bw4; x++)
+                        memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16);
+                    for (int y = 0; y < bh4; y++)
+                        memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16);
+                }
+            }
+        }
+        if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+            splat_intraref(&t->rt, t->by, t->bx, bs);
+        }
+    } else if (!(f->frame_hdr->frame_type & 1)) {
+        // intra block copy
+        refmvs_candidate mvstack[8];
+        int n_mvs, ctx;
+        dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                          (union refmvs_refpair) { .ref = { 0, -1 }},
+                          bs, intra_edge_flags, t->by, t->bx);
+
+        if (mvstack[0].mv.mv[0].n)
+            b->mv[0] = mvstack[0].mv.mv[0];
+        else if (mvstack[1].mv.mv[0].n)
+            b->mv[0] = mvstack[1].mv.mv[0];
+        else {
+            if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
+                b->mv[0].y = 0;
+                b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
+            } else {
+                b->mv[0].y = -(512 << f->seq_hdr->sb128);
+                b->mv[0].x = 0;
+            }
+        }
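+        // (No usable candidate: predict a DV one superblock up --
+        // -(512 << sb128) eighth-pels == 64 or 128 px -- or, when that row
+        // would lie above the tile, a position 64/128 + 256 px to the left
+        // instead; read_mv_residual() below then codes the actual DV
+        // against this prediction.)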
+
+        const union mv ref = b->mv[0];
+        read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
+
+        // clip intrabc motion vector to decoded parts of current tile
+        int border_left = ts->tiling.col_start * 4;
+        int border_top  = ts->tiling.row_start * 4;
+        if (has_chroma) {
+            if (bw4 < 2 &&  ss_hor)
+                border_left += 4;
+            if (bh4 < 2 &&  ss_ver)
+                border_top  += 4;
+        }
+        int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
+        int src_top    = t->by * 4 + (b->mv[0].y >> 3);
+        int src_right  = src_left + bw4 * 4;
+        int src_bottom = src_top  + bh4 * 4;
+        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
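+        // (col_end is in 4-px units: rounding it up to a multiple of bw4
+        // before the * 4 makes border_right a block-width-aligned pixel
+        // edge -- e.g. bw4 == 4, col_end == 67 gives
+        // ((67 + 3) & ~3) * 4 == 272 px.)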
+
+        // check against left or right tile boundary and adjust if necessary
+        if (src_left < border_left) {
+            src_right += border_left - src_left;
+            src_left  += border_left - src_left;
+        } else if (src_right > border_right) {
+            src_left  -= src_right - border_right;
+            src_right -= src_right - border_right;
+        }
+        // check against top tile boundary and adjust if necessary
+        if (src_top < border_top) {
+            src_bottom += border_top - src_top;
+            src_top    += border_top - src_top;
+        }
+
+        const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+        const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+        const int sb_size = 1 << (6 + f->seq_hdr->sb128);
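+        // (t->bx/t->by are in 4-px units: >> (4 + sb128) is the superblock
+        // index, << (6 + sb128) converts back to pixels, so sbx/sby are the
+        // pixel origin of the containing 64/128-px superblock -- with 64-px
+        // superblocks, t->bx == 37 gives sbx == (37 >> 4) << 6 == 128.)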
+        // check for overlap with current superblock
+        if (src_bottom > sby && src_right > sbx) {
+            if (src_top - border_top >= src_bottom - sby) {
+                // if possible, move src up into the previous superblock row
+                src_top    -= src_bottom - sby;
+                src_bottom -= src_bottom - sby;
+            } else if (src_left - border_left >= src_right - sbx) {
+                // if possible, move src left into the previous superblock
+                src_left  -= src_right - sbx;
+                src_right -= src_right - sbx;
+            }
+        }
+        // move src up if it is below current superblock row
+        if (src_bottom > sby + sb_size) {
+            src_top    -= src_bottom - (sby + sb_size);
+            src_bottom -= src_bottom - (sby + sb_size);
+        }
+        // error out if mv still overlaps with the current superblock
+        if (src_bottom > sby && src_right > sbx)
+            return -1;
+
+        b->mv[0].x = (src_left - t->bx * 4) * 8;
+        b->mv[0].y = (src_top  - t->by * 4) * 8;
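+        // (src_* are full pixels, mvs are stored in eighth-pels -- hence
+        // the >> 3 when deriving src above and the * 8 here.)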
+
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
+                   b->mv[0].y, b->mv[0].x, ref.y, ref.x,
+                   mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng);
+        read_vartx_tree(t, b, bs, bx4, by4);
+
+        // reconstruction
+        if (f->frame_thread.pass == 1) {
+            f->bd_fn.read_coef_blocks(t, bs, b);
+            b->filter2d = FILTER_2D_BILINEAR;
+        } else {
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+        }
+
+        splat_intrabc_mv(&t->rt, t->by, t->bx, bs, b->mv[0]);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+        }
+    } else {
+        // inter-specific mode/mv coding
+        int is_comp, has_subpel_filter;
+
+        if (b->skip_mode) {
+            is_comp = 1;
+        } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
+                   f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
+        {
+            const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
+                                         have_top, have_left);
+            is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
+                          ts->cdf.m.comp[ctx]);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
+        } else {
+            is_comp = 0;
+        }
+
+        if (b->skip_mode) {
+            b->ref[0] = f->frame_hdr->skip_mode_refs[0];
+            b->ref[1] = f->frame_hdr->skip_mode_refs[1];
+            b->comp_type = COMP_INTER_AVG;
+            b->inter_mode = NEARESTMV_NEARESTMV;
+            b->drl_idx = NEAREST_DRL;
+            has_subpel_filter = 0;
+
+            refmvs_candidate mvstack[8];
+            int n_mvs, ctx;
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = {
+                                    b->ref[0] + 1, b->ref[1] + 1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
+
+            b->mv[0] = mvstack[0].mv.mv[0];
+            b->mv[1] = mvstack[0].mv.mv[1];
+            fix_mv_precision(f->frame_hdr, &b->mv[0]);
+            fix_mv_precision(f->frame_hdr, &b->mv[1]);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
+                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+                       b->ref[0], b->ref[1]);
+        } else if (is_comp) {
+            const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
+                                                 have_top, have_left);
+            if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                    ts->cdf.m.comp_dir[dir_ctx]))
+            {
+                // bidir - first reference (fw)
+                const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
+                                                     have_top, have_left);
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_fwd_ref[0][ctx1]))
+                {
+                    const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                    b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_fwd_ref[2][ctx2]);
+                } else {
+                    const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                    b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                    ts->cdf.m.comp_fwd_ref[1][ctx2]);
+                }
+
+                // second reference (bw)
+                const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
+                                                     have_top, have_left);
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_bwd_ref[0][ctx3]))
+                {
+                    b->ref[1] = 6;
+                } else {
+                    const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                    b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_bwd_ref[1][ctx4]);
+                }
+            } else {
+                // unidir
+                const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
+                                                     have_top, have_left);
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_uni_ref[0][uctx_p]))
+                {
+                    b->ref[0] = 4;
+                    b->ref[1] = 6;
+                } else {
+                    const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                    b->ref[0] = 0;
+                    b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_uni_ref[1][uctx_p1]);
+                    if (b->ref[1] == 2) {
+                        const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
+                                                               have_top, have_left);
+                        b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                         ts->cdf.m.comp_uni_ref[2][uctx_p2]);
+                    }
+                }
+            }
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-refs[%d/%d]: r=%d\n",
+                       b->ref[0], b->ref[1], ts->msac.rng);
+
+            refmvs_candidate mvstack[8];
+            int n_mvs, ctx;
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = {
+                                    b->ref[0] + 1, b->ref[1] + 1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
+
+            b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                                ts->cdf.m.comp_inter_mode[ctx],
+                                N_COMP_INTER_PRED_MODES - 1);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
+                       b->inter_mode, ctx, n_mvs, ts->msac.rng);
+
+            const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
+            b->drl_idx = NEAREST_DRL;
+            if (b->inter_mode == NEWMV_NEWMV) {
+                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
+                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
+                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
+                    }
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+                               b->drl_idx, n_mvs, ts->msac.rng);
+                }
+            } else if (im[0] == NEARMV || im[1] == NEARMV) {
+                b->drl_idx = NEARER_DRL;
+                if (n_mvs > 2) { // NEAR or NEARISH
+                    const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v2]);
+                    if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
+                        const int drl_ctx_v3 = get_drl_context(mvstack, 2);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v3]);
+                    }
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+                               b->drl_idx, n_mvs, ts->msac.rng);
+                }
+            }
+            assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+
+#define assign_comp_mv(idx) \
+            switch (im[idx]) { \
+            case NEARMV: \
+            case NEARESTMV: \
+                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
+                break; \
+            case GLOBALMV: \
+                has_subpel_filter |= \
+                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
+                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
+                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
+                break; \
+            case NEWMV: \
+                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+                read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
+                                 !f->frame_hdr->force_integer_mv); \
+                break; \
+            }
+            has_subpel_filter = imin(bw4, bh4) == 1 ||
+                                b->inter_mode != GLOBALMV_GLOBALMV;
+            assign_comp_mv(0);
+            assign_comp_mv(1);
+#undef assign_comp_mv
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
+                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+                       ts->msac.rng);
+
+            // jnt_comp vs. seg vs. wedge
+            int is_segwedge = 0;
+            if (f->seq_hdr->masked_compound) {
+                const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
+
+                is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                  ts->cdf.m.mask_comp[mask_ctx]);
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
+                           is_segwedge, mask_ctx, ts->msac.rng);
+            }
+
+            if (!is_segwedge) {
+                if (f->seq_hdr->jnt_comp) {
+                    const int jnt_ctx =
+                        get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
+                                         f->cur.frame_hdr->frame_offset,
+                                         f->refp[b->ref[0]].p.frame_hdr->frame_offset,
+                                         f->refp[b->ref[1]].p.frame_hdr->frame_offset,
+                                         t->a, &t->l, by4, bx4);
+                    b->comp_type = COMP_INTER_WEIGHTED_AVG +
+                                   dav1d_msac_decode_bool_adapt(&ts->msac,
+                                       ts->cdf.m.jnt_comp[jnt_ctx]);
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
+                               b->comp_type == COMP_INTER_AVG,
+                               jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
+                               t->l.comp_type[by4], t->l.ref[0][by4],
+                               ts->msac.rng);
+                } else {
+                    b->comp_type = COMP_INTER_AVG;
+                }
+            } else {
+                if (wedge_allowed_mask & (1 << bs)) {
+                    const int ctx = dav1d_wedge_ctx_lut[bs];
+                    b->comp_type = COMP_INTER_WEDGE -
+                                   dav1d_msac_decode_bool_adapt(&ts->msac,
+                                       ts->cdf.m.wedge_comp[ctx]);
+                    if (b->comp_type == COMP_INTER_WEDGE)
+                        b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                           ts->cdf.m.wedge_idx[ctx], 15);
+                } else {
+                    b->comp_type = COMP_INTER_SEG;
+                }
+                b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
+                           b->comp_type == COMP_INTER_WEDGE,
+                           b->wedge_idx, b->mask_sign, ts->msac.rng);
+            }
+        } else {
+            b->comp_type = COMP_INTER_NONE;
+
+            // ref
+            if (seg && seg->ref > 0) {
+                b->ref[0] = seg->ref - 1;
+            } else if (seg && (seg->globalmv || seg->skip)) {
+                b->ref[0] = 0;
+            } else {
+                const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
+                                                 have_top, have_left);
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                 ts->cdf.m.ref[0][ctx1]))
+                {
+                    const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
+                                                       have_top, have_left);
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                     ts->cdf.m.ref[1][ctx2]))
+                    {
+                        b->ref[0] = 6;
+                    } else {
+                        const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                            ts->cdf.m.ref[5][ctx3]);
+                    }
+                } else {
+                    const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
+                                                       have_top, have_left);
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                     ts->cdf.m.ref[2][ctx2]))
+                    {
+                        const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                            ts->cdf.m.ref[4][ctx3]);
+                    } else {
+                        const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.ref[3][ctx3]);
+                    }
+                }
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
+            }
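+            // (The bools above walk a fixed tree over the 7 single refs
+            // (0..6 = LAST, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2, ALTREF):
+            // m.ref[0] splits forward (0..3) from backward (4..6),
+            // m.ref[1] picks ALTREF vs m.ref[5]'s BWDREF/ALTREF2 choice,
+            // and m.ref[2]..m.ref[4] resolve the four forward refs.)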
+            b->ref[1] = -1;
+
+            refmvs_candidate mvstack[8];
+            int n_mvs, ctx;
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
+
+            // mode parsing and mv derivation from ref_mvs
+            if ((seg && (seg->skip || seg->globalmv)) ||
+                dav1d_msac_decode_bool_adapt(&ts->msac,
+                                             ts->cdf.m.newmv_mode[ctx & 7]))
+            {
+                if ((seg && (seg->skip || seg->globalmv)) ||
+                    !dav1d_msac_decode_bool_adapt(&ts->msac,
+                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
+                {
+                    b->inter_mode = GLOBALMV;
+                    b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
+                                          t->bx, t->by, bw4, bh4, f->frame_hdr);
+                    has_subpel_filter = imin(bw4, bh4) == 1 ||
+                        f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
+                } else {
+                    has_subpel_filter = 1;
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
+                    { // NEAREST, NEARER, NEAR or NEARISH
+                        b->inter_mode = NEARMV;
+                        b->drl_idx = NEARER_DRL;
+                        if (n_mvs > 2) { // NEARER, NEAR or NEARISH
+                            const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+                            b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                              ts->cdf.m.drl_bit[drl_ctx_v2]);
+                            if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
+                                const int drl_ctx_v3 =
+                                    get_drl_context(mvstack, 2);
+                                b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                  ts->cdf.m.drl_bit[drl_ctx_v3]);
+                            }
+                        }
+                    } else {
+                        b->inter_mode = NEARESTMV;
+                        b->drl_idx = NEAREST_DRL;
+                    }
+                    assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+                    if (b->drl_idx < NEAR_DRL)
+                        fix_mv_precision(f->frame_hdr, &b->mv[0]);
+                }
+
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
+                           b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
+                           ts->msac.rng);
+            } else {
+                has_subpel_filter = 1;
+                b->inter_mode = NEWMV;
+                b->drl_idx = NEAREST_DRL;
+                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
+                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
+                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
+                    }
+                }
+                assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+                if (n_mvs > 1) {
+                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+                } else {
+                    assert(!b->drl_idx);
+                    b->mv[0] = mvstack[0].mv.mv[0];
+                    fix_mv_precision(f->frame_hdr, &b->mv[0]);
+                }
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-intermode[%d,drl=%d]: r=%d\n",
+                           b->inter_mode, b->drl_idx, ts->msac.rng);
+                read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
+                                 !f->frame_hdr->force_integer_mv);
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
+                           b->mv[0].y, b->mv[0].x, ts->msac.rng);
+            }
+
+            // interintra flags
+            const int ii_sz_grp = dav1d_ymode_size_context[bs];
+            if (f->seq_hdr->inter_intra &&
+                interintra_allowed_mask & (1 << bs) &&
+                dav1d_msac_decode_bool_adapt(&ts->msac,
+                                             ts->cdf.m.interintra[ii_sz_grp]))
+            {
+                b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                         ts->cdf.m.interintra_mode[ii_sz_grp],
+                                         N_INTER_INTRA_PRED_MODES - 1);
+                const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
+                b->interintra_type = INTER_INTRA_BLEND +
+                                     dav1d_msac_decode_bool_adapt(&ts->msac,
+                                         ts->cdf.m.interintra_wedge[wedge_ctx]);
+                if (b->interintra_type == INTER_INTRA_WEDGE)
+                    b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                       ts->cdf.m.wedge_idx[wedge_ctx], 15);
+            } else {
+                b->interintra_type = INTER_INTRA_NONE;
+            }
+            if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
+                interintra_allowed_mask & (1 << bs))
+            {
+                printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
+                       b->interintra_type, b->interintra_mode,
+                       b->wedge_idx, ts->msac.rng);
+            }
+
+            // motion variation
+            if (f->frame_hdr->switchable_motion_mode &&
+                b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
+                // is not warped global motion
+                !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
+                  f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
+                // has overlappable neighbours
+                ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
+                 (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
+            {
+                // reaching here means the block allows obmc - check warp by
+                // finding matching-ref blocks in top/left edges
+                uint64_t mask[2] = { 0, 0 };
+                find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
+                                  have_left, have_top, b->ref[0], mask);
+                const int allow_warp = !f->svc[b->ref[0]][0].scale &&
+                    !f->frame_hdr->force_integer_mv &&
+                    f->frame_hdr->warp_motion && (mask[0] | mask[1]);
+
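+                // with warp allowed this is a 3-way symbol (translation, OBMC
+                // or warp); otherwise a single bool chooses OBMC vs translation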
+                b->motion_mode = allow_warp ?
+                    dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                        ts->cdf.m.motion_mode[bs], 2) :
+                    dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
+                if (b->motion_mode == MM_WARP) {
+                    has_subpel_filter = 0;
+                    derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+                    if (DEBUG_BLOCK_INFO)
+                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
+                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
+                               "mv=y:%d,x:%d\n",
+                               signabs(t->warpmv.matrix[0]),
+                               signabs(t->warpmv.matrix[1]),
+                               signabs(t->warpmv.matrix[2]),
+                               signabs(t->warpmv.matrix[3]),
+                               signabs(t->warpmv.matrix[4]),
+                               signabs(t->warpmv.matrix[5]),
+                               signabs(t->warpmv.alpha),
+                               signabs(t->warpmv.beta),
+                               signabs(t->warpmv.gamma),
+                               signabs(t->warpmv.delta),
+                               b->mv[0].y, b->mv[0].x);
+#undef signabs
+                    if (f->frame_thread.pass) {
+                        if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
+                            b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
+                            b->matrix[1] = t->warpmv.matrix[3];
+                            b->matrix[2] = t->warpmv.matrix[4];
+                            b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
+                        } else {
+                            b->matrix[0] = SHRT_MIN;
+                        }
+                    }
+                }
+
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIu64 "x/0x%"
+                           PRIu64 "x]\n", b->motion_mode, ts->msac.rng, mask[0],
+                            mask[1]);
+            } else {
+                b->motion_mode = MM_TRANSLATION;
+            }
+        }
+
+        // subpel filter
+        enum Dav1dFilterMode filter[2];
+        if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
+            if (has_subpel_filter) {
+                const int comp = b->comp_type != COMP_INTER_NONE;
+                const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
+                                                by4, bx4);
+                filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                               ts->cdf.m.filter[0][ctx1],
+                               DAV1D_N_SWITCHABLE_FILTERS - 1);
+                if (f->seq_hdr->dual_filter) {
+                    const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
+                                                    b->ref[0], by4, bx4);
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
+                               filter[0], ctx1, ts->msac.rng);
+                    filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                    ts->cdf.m.filter[1][ctx2],
+                                    DAV1D_N_SWITCHABLE_FILTERS - 1);
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
+                               filter[1], ctx2, ts->msac.rng);
+                } else {
+                    filter[1] = filter[0];
+                    if (DEBUG_BLOCK_INFO)
+                        printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
+                               filter[0], ctx1, ts->msac.rng);
+                }
+            } else {
+                filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
+            }
+        } else {
+            filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
+        }
+        b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
+
+        read_vartx_tree(t, b, bs, bx4, by4);
+
+        // reconstruction
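+        // (pass 1 parses and stashes coefficients only; the actual inter
+        // reconstruction runs in pass 0 or 2)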
+        if (f->frame_thread.pass == 1) {
+            f->bd_fn.read_coef_blocks(t, bs, b);
+        } else {
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+        }
+
+        if (f->frame_hdr->loopfilter.level_y[0] ||
+            f->frame_hdr->loopfilter.level_y[1])
+        {
+            const int is_globalmv =
+                b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
+            const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
+                &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
+            const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+            dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
+                                       t->bx, t->by, f->w4, f->h4, b->skip, bs,
+                                       f->frame_hdr->segmentation.lossless[b->seg_id] ?
+                                           (enum RectTxfmSize) TX_4X4 : b->max_ytx,
+                                       tx_split, b->uvtx, f->cur.p.layout,
+                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        }
+
+        // context updates
+        if (is_comp) {
+            splat_tworef_mv(&t->rt, t->by, t->bx, bs, b->inter_mode,
+                            (refmvs_refpair) { .ref = { b->ref[0], b->ref[1] }},
+                            (refmvs_mvpair) { .mv = { [0] = b->mv[0], [1] = b->mv[1] }});
+        } else {
+            splat_oneref_mv(&t->rt, t->by, t->bx, bs, b->inter_mode,
+                            b->ref[0], b->mv[0], b->interintra_type);
+        }
+
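+        /* The set_ctx()/case_set() pair (src/ctx.h) is dav1d's context-
+         * splatting idiom: set_ctx lists the per-entry stores, and case_set
+         * instantiates it with an integer type wide enough for the run length,
+         * using a byte-replication multiplier (0x01, 0x0101, ...) so a single
+         * store fills 1-16 context entries at once. */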
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+        rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+        rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+        rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+        rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+        rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+        }
+    }
+
+    // update contexts
+    if (f->frame_hdr->segmentation.enabled &&
+        f->frame_hdr->segmentation.update_map)
+    {
+        uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < bh4; y++) { \
+            rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+            seg_ptr += f->b4_stride; \
+        }
+        case_set(bw4, NULL, 0, 0);
+#undef set_ctx
+    }
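+    /* noskip_mask feeds the loopfilter: each entry covers an 8-px-tall strip
+     * (hence by4 >> 1 and the y += 2 step), split into two uint16 halves of
+     * 4-px-column bits across the 128-px superblock.
+     * E.g. bw4 = 8, bx4 = 4: mask = (~0U >> 24) << 4 = 0xff0. */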
+    if (!b->skip) {
+        uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
+        const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
+        const int bx_idx = (bx4 & 16) >> 4;
+        for (int y = 0; y < bh4; y += 2, noskip_mask++) {
+            (*noskip_mask)[bx_idx] |= mask;
+            if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
+                (*noskip_mask)[1] |= mask;
+        }
+    }
+
+    return 0;
+}
+
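+/* When built with MemorySanitizer, decode_b() is transparently wrapped (via
+ * the #define below) so that after every block of a reconstruction pass the
+ * reconstructed pixel rows are scanned for uninitialized bytes:
+ * __msan_test_shadow() returns -1 iff the whole range is initialized, so a
+ * failing row is located and reported before __msan_check_mem_is_initialized()
+ * aborts with the full MSan report. */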
+#if __has_feature(memory_sanitizer)
+
+#include <sanitizer/msan_interface.h>
+
+static int checked_decode_b(Dav1dTileContext *const t,
+                            const enum BlockLevel bl,
+                            const enum BlockSize bs,
+                            const enum BlockPartition bp,
+                            const enum EdgeFlags intra_edge_flags)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
+
+    if (err == 0 && !(f->frame_thread.pass & 1)) {
+        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+        const int bw4 = b_dim[0], bh4 = b_dim[1];
+        const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+        const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                               (bw4 > ss_hor || t->bx & 1) &&
+                               (bh4 > ss_ver || t->by & 1);
+
+        for (int p = 0; p < 1 + 2 * has_chroma; p++) {
+            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const ptrdiff_t stride = f->cur.stride[!!p];
+            const int bx = t->bx & ~ss_hor;
+            const int by = t->by & ~ss_ver;
+            const int width  = w4 << (2 - ss_hor + (bw4 == ss_hor));
+            const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));
+
+            const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
+                                  (bx << (2 - ss_hor + !!f->seq_hdr->hbd));
+
+            for (int y = 0; y < height; data += stride, y++) {
+                const size_t line_sz = width << !!f->seq_hdr->hbd;
+                if (__msan_test_shadow(data, line_sz) != -1) {
+                    fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
+                            p, bx, by, w4, h4, y);
+                    __msan_check_mem_is_initialized(data, line_sz);
+                }
+            }
+        }
+    }
+
+    return err;
+}
+
+#define decode_b checked_decode_b
+
+#endif /* __has_feature(memory_sanitizer) */
+
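+/* Recursively decode one node of the superblock partition tree. hsz is half
+ * the current block size in 4-px units; when neither half lies inside the
+ * frame, the partition is implicitly SPLIT (no symbol is coded) and only the
+ * top-left quadrant is descended into. */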
+static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
+                     const EdgeNode *const node)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int hsz = 16 >> bl;
+    const int have_h_split = f->bw > t->bx + hsz;
+    const int have_v_split = f->bh > t->by + hsz;
+
+    if (!have_h_split && !have_v_split) {
+        assert(bl < BL_8X8);
+        return decode_sb(t, bl + 1, ((const EdgeBranch *) node)->split[0]);
+    }
+
+    uint16_t *pc;
+    enum BlockPartition bp;
+    int ctx, bx8, by8;
+    if (f->frame_thread.pass != 2) {
+        if (0 && bl == BL_64X64)
+            printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
+                   f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng);
+        bx8 = (t->bx & 31) >> 1;
+        by8 = (t->by & 31) >> 1;
+        ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
+        pc = t->ts->cdf.m.partition[bl][ctx];
+    }
+
+    if (have_h_split && have_v_split) {
+        if (f->frame_thread.pass == 2) {
+            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+            bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
+        } else {
+            bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc,
+                                                  dav1d_partition_type_count[bl]);
+            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
+                (bp == PARTITION_V || bp == PARTITION_V4 ||
+                 bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
+            {
+                return 1;
+            }
+            if (DEBUG_BLOCK_INFO)
+                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
+                       t->ts->msac.rng);
+        }
+        const uint8_t *const b = dav1d_block_sizes[bl][bp];
+
+        switch (bp) {
+        case PARTITION_NONE:
+            if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
+                return -1;
+            break;
+        case PARTITION_H:
+            if (decode_b(t, bl, b[0], PARTITION_H, node->h[0]))
+                return -1;
+            t->by += hsz;
+            if (decode_b(t, bl, b[0], PARTITION_H, node->h[1]))
+                return -1;
+            t->by -= hsz;
+            break;
+        case PARTITION_V:
+            if (decode_b(t, bl, b[0], PARTITION_V, node->v[0]))
+                return -1;
+            t->bx += hsz;
+            if (decode_b(t, bl, b[0], PARTITION_V, node->v[1]))
+                return -1;
+            t->bx -= hsz;
+            break;
+        case PARTITION_SPLIT:
+            if (bl == BL_8X8) {
+                const EdgeTip *const tip = (const EdgeTip *) node;
+                assert(hsz == 1);
+                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0]))
+                    return -1;
+                const enum Filter2d tl_filter = t->tl_4x4_filter;
+                t->bx++;
+                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1]))
+                    return -1;
+                t->bx--;
+                t->by++;
+                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2]))
+                    return -1;
+                t->bx++;
+                t->tl_4x4_filter = tl_filter;
+                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[3]))
+                    return -1;
+                t->bx--;
+                t->by--;
+            } else {
+                const EdgeBranch *const branch = (const EdgeBranch *) node;
+                if (decode_sb(t, bl + 1, branch->split[0]))
+                    return 1;
+                t->bx += hsz;
+                if (decode_sb(t, bl + 1, branch->split[1]))
+                    return 1;
+                t->bx -= hsz;
+                t->by += hsz;
+                if (decode_sb(t, bl + 1, branch->split[2]))
+                    return 1;
+                t->bx += hsz;
+                if (decode_sb(t, bl + 1, branch->split[3]))
+                    return 1;
+                t->bx -= hsz;
+                t->by -= hsz;
+            }
+            break;
+        case PARTITION_T_TOP_SPLIT: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[0]))
+                return -1;
+            t->bx += hsz;
+            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[1]))
+                return -1;
+            t->bx -= hsz;
+            t->by += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, branch->tts[2]))
+                return -1;
+            t->by -= hsz;
+            break;
+        }
+        case PARTITION_T_BOTTOM_SPLIT: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, branch->tbs[0]))
+                return -1;
+            t->by += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[1]))
+                return -1;
+            t->bx += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[2]))
+                return -1;
+            t->bx -= hsz;
+            t->by -= hsz;
+            break;
+        }
+        case PARTITION_T_LEFT_SPLIT: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[0]))
+                return -1;
+            t->by += hsz;
+            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[1]))
+                return -1;
+            t->by -= hsz;
+            t->bx += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, branch->tls[2]))
+                return -1;
+            t->bx -= hsz;
+            break;
+        }
+        case PARTITION_T_RIGHT_SPLIT: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, branch->trs[0]))
+                return -1;
+            t->bx += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[1]))
+                return -1;
+            t->by += hsz;
+            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[2]))
+                return -1;
+            t->by -= hsz;
+            t->bx -= hsz;
+            break;
+        }
+        case PARTITION_H4: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[0]))
+                return -1;
+            t->by += hsz >> 1;
+            if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[1]))
+                return -1;
+            t->by += hsz >> 1;
+            if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[2]))
+                return -1;
+            t->by += hsz >> 1;
+            if (t->by < f->bh)
+                if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[3]))
+                    return -1;
+            t->by -= hsz * 3 >> 1;
+            break;
+        }
+        case PARTITION_V4: {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[0]))
+                return -1;
+            t->bx += hsz >> 1;
+            if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[1]))
+                return -1;
+            t->bx += hsz >> 1;
+            if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[2]))
+                return -1;
+            t->bx += hsz >> 1;
+            if (t->bx < f->bw)
+                if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[3]))
+                    return -1;
+            t->bx -= hsz * 3 >> 1;
+            break;
+        }
+        default: assert(0);
+        }
+    } else if (have_h_split) {
+        unsigned is_split;
+        if (f->frame_thread.pass == 2) {
+            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+            is_split = b->bl != bl;
+        } else {
+            is_split = dav1d_msac_decode_bool(&t->ts->msac,
+                           gather_top_partition_prob(pc, bl));
+            if (DEBUG_BLOCK_INFO)
+                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+                       is_split ? PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng);
+        }
+
+        assert(bl < BL_8X8);
+        if (is_split) {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            bp = PARTITION_SPLIT;
+            if (decode_sb(t, bl + 1, branch->split[0])) return 1;
+            t->bx += hsz;
+            if (decode_sb(t, bl + 1, branch->split[1])) return 1;
+            t->bx -= hsz;
+        } else {
+            bp = PARTITION_H;
+            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0],
+                         PARTITION_H, node->h[0]))
+                return -1;
+        }
+    } else {
+        assert(have_v_split);
+        unsigned is_split;
+        if (f->frame_thread.pass == 2) {
+            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+            is_split = b->bl != bl;
+        } else {
+            is_split = dav1d_msac_decode_bool(&t->ts->msac,
+                           gather_left_partition_prob(pc, bl));
+            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
+                return 1;
+            if (DEBUG_BLOCK_INFO)
+                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+                       is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng);
+        }
+
+        assert(bl < BL_8X8);
+        if (is_split) {
+            const EdgeBranch *const branch = (const EdgeBranch *) node;
+            bp = PARTITION_SPLIT;
+            if (decode_sb(t, bl + 1, branch->split[0])) return 1;
+            t->by += hsz;
+            if (decode_sb(t, bl + 1, branch->split[2])) return 1;
+            t->by -= hsz;
+        } else {
+            bp = PARTITION_V;
+            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
+                         PARTITION_V, node->v[0]))
+                return -1;
+        }
+    }
+
+    if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+        rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+        case_set_upto16(hsz,,,);
+#undef set_ctx
+    }
+
+    return 0;
+}
+
+static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) {
+    memset(ctx->intra, keyframe, sizeof(ctx->intra));
+    memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
+    if (keyframe)
+        memset(ctx->mode, DC_PRED, sizeof(ctx->mode));
+
+    if (pass == 2) return;
+
+    memset(ctx->partition, 0, sizeof(ctx->partition));
+    memset(ctx->skip, 0, sizeof(ctx->skip));
+    memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
+    memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y));
+    memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv));
+    memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra));
+    memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
+    if (!keyframe) {
+        memset(ctx->ref, -1, sizeof(ctx->ref));
+        memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
+        memset(ctx->mode, NEARESTMV, sizeof(ctx->mode));
+    }
+    memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
+    memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
+    memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
+    memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
+    memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
+}
+
+// { Y+U+V, Y+U } * 4
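+// e.g. 4:2:0: each chroma plane is 1/4 of luma, so Y+U+V = 4 * (1 + 1/4 + 1/4) = 6
+// and Y+U = 4 * (1 + 1/4) = 5; 4:4:4 gives 4 * 3 = 12 and 4 * 2 = 8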
+static const uint8_t ss_size_mul[4][2] = {
+    [DAV1D_PIXEL_LAYOUT_I400] = {  4, 4 },
+    [DAV1D_PIXEL_LAYOUT_I420] = {  6, 5 },
+    [DAV1D_PIXEL_LAYOUT_I422] = {  8, 6 },
+    [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },
+};
+
+static void setup_tile(Dav1dTileState *const ts,
+                       const Dav1dFrameContext *const f,
+                       const uint8_t *const data, const size_t sz,
+                       const int tile_row, const int tile_col,
+                       const int tile_start_off)
+{
+    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+    const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
+    const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
+    const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
+    const int sb_shift = f->sb_shift;
+
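+    /* tile_start_off counts luma pixels from the frame origin; ss_size_mul
+     * rescales it into buffer units: pal_idx takes size_mul[1] / 4 bytes per
+     * luma pixel, and coefficients take 2 bytes each at 8 bpc vs 4 at
+     * 10/12 bpc, hence the >> !hbd below. */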
+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+    ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
+        &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
+        NULL;
+
+    ts->frame_thread.cf = f->frame_thread.cf ?
+        (uint8_t*)f->frame_thread.cf +
+            (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
+        NULL;
+
+    dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
+    ts->last_qidx = f->frame_hdr->quant.yac;
+    memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
+
+    dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
+
+    ts->tiling.row = tile_row;
+    ts->tiling.col = tile_col;
+    ts->tiling.col_start = col_sb_start << sb_shift;
+    ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
+    ts->tiling.row_start = row_sb_start << sb_shift;
+    ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
+
+    // Reference Restoration Unit (used as the predictor for subexp delta coding)
+    int sb_idx, unit_idx;
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+        // vertical components only
+        sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
+        unit_idx = (ts->tiling.row_start & 16) >> 3;
+    } else {
+        sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
+        unit_idx = ((ts->tiling.row_start & 16) >> 3) +
+                   ((ts->tiling.col_start & 16) >> 4);
+    }
+    for (int p = 0; p < 3; p++) {
+        if (!((f->lf.restore_planes >> p) & 1U))
+            continue;
+
+        if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const int d = f->frame_hdr->super_res.width_scale_denominator;
+            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+            const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
+            const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
+            const int px_x = x << (unit_size_log2 + ss_hor);
+            const int u_idx = unit_idx + ((px_x & 64) >> 6);
+            const int sb128x = px_x >> 7;
+            if (sb128x >= f->sr_sb128w) continue;
+            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
+        } else {
+            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+        }
+
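+        /* Seed the per-plane reference unit with fixed starting coefficients
+         * so that the first unit coded in this tile has a well-defined
+         * baseline for its subexp deltas. */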
+        ts->lr_ref[p]->filter_v[0] = 3;
+        ts->lr_ref[p]->filter_v[1] = -7;
+        ts->lr_ref[p]->filter_v[2] = 15;
+        ts->lr_ref[p]->filter_h[0] = 3;
+        ts->lr_ref[p]->filter_h[1] = -7;
+        ts->lr_ref[p]->filter_h[2] = 15;
+        ts->lr_ref[p]->sgr_weights[0] = -32;
+        ts->lr_ref[p]->sgr_weights[1] = 31;
+    }
+
+    if (f->n_tc > 1)
+        atomic_init(&ts->progress, row_sb_start);
+}
+
+static void read_restoration_info(Dav1dTileContext *const t,
+                                  Av1RestorationUnit *const lr, const int p,
+                                  const enum Dav1dRestorationType frame_type)
+{
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+
+    if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
+        const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                               ts->cdf.m.restore_switchable, 2);
+        lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
+                                          DAV1D_RESTORATION_WIENER :
+                                          DAV1D_RESTORATION_NONE;
+    } else {
+        const unsigned type =
+            dav1d_msac_decode_bool_adapt(&ts->msac,
+                frame_type == DAV1D_RESTORATION_WIENER ?
+                ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
+        lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
+    }
+
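+    /* Wiener taps are coded as subexp deltas against the previous unit's taps
+     * in this tile (ts->lr_ref); the +5/+23/+17 offsets bias the signed taps
+     * into the unsigned ranges the subexp decoder expects. Tap 0 is implicitly
+     * zero for chroma, whose Wiener filter is shorter. */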
+    if (lr->type == DAV1D_RESTORATION_WIENER) {
+        lr->filter_v[0] = p ? 0 :
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
+        lr->filter_v[1] =
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
+        lr->filter_v[2] =
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;
+
+        lr->filter_h[0] = p ? 0 :
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
+        lr->filter_h[1] =
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
+        lr->filter_h[2] =
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
+        memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
+        ts->lr_ref[p] = lr;
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
+                   p, lr->filter_v[0], lr->filter_v[1],
+                   lr->filter_v[2], lr->filter_h[0],
+                   lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
+    } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
+        const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
+        lr->sgr_idx = idx;
+        lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
+            0;
+        lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
+            95;
+        memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
+        memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
+        ts->lr_ref[p] = lr;
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
+                   p, lr->sgr_idx, lr->sgr_weights[0],
+                   lr->sgr_weights[1], ts->msac.rng);
+    }
+}
+
+int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
+    const Dav1dFrameContext *const f = t->f;
+    const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dContext *const c = f->c;
+    const int sb_step = f->sb_step;
+    const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
+    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+
+    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+        dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
+                                     ts->tiling.col_end, ts->tiling.row_start,
+                                     ts->tiling.row_end, t->by >> f->sb_shift,
+                                     ts->tiling.row);
+    }
+
+    reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
+    if (f->frame_thread.pass == 2) {
+        for (t->bx = ts->tiling.col_start,
+             t->a = f->a + col_sb128_start + tile_row * f->sb128w;
+             t->bx < ts->tiling.col_end; t->bx += sb_step)
+        {
+            if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
+                return 1;
+            if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
+                return 1;
+            if (t->bx & 16 || f->seq_hdr->sb128)
+                t->a++;
+        }
+        f->bd_fn.backup_ipred_edge(t);
+        return 0;
+    }
+
+    // error out on symbol decoder overread
+    if (ts->msac.cnt < -15) return 1;
+
+    if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
+        if (c->n_fc > 1) for (int n = 0; n < 7; n++)
+            if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
+                                          PLANE_TYPE_BLOCK))
+            {
+                return 1;
+            }
+        dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row,
+                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+                               t->by >> 1, (t->by + sb_step) >> 1);
+    }
+    memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
+    const int sb128y = t->by >> 5;
+    for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
+         t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
+         t->bx < ts->tiling.col_end; t->bx += sb_step)
+    {
+        if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
+            return 1;
+        if (root_bl == BL_128X128) {
+            t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
+            t->cur_sb_cdef_idx_ptr[0] = -1;
+            t->cur_sb_cdef_idx_ptr[1] = -1;
+            t->cur_sb_cdef_idx_ptr[2] = -1;
+            t->cur_sb_cdef_idx_ptr[3] = -1;
+        } else {
+            t->cur_sb_cdef_idx_ptr =
+                &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
+                                      ((t->by & 16) >> 3)];
+            t->cur_sb_cdef_idx_ptr[0] = -1;
+        }
+        // Restoration filter
+        for (int p = 0; p < 3; p++) {
+            if (!((f->lf.restore_planes >> p) & 1U))
+                continue;
+
+            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+            const int y = t->by * 4 >> ss_ver;
+            const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+
+            const int unit_size = 1 << unit_size_log2;
+            const unsigned mask = unit_size - 1;
+            if (y & mask) continue;
+            const int half_unit = unit_size >> 1;
+            // Round half up at frame boundaries, if there's more than one
+            // restoration unit
+            if (y && y + half_unit > h) continue;
+
+            const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
+
+            if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+                const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+                const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
+
+                const int d = f->frame_hdr->super_res.width_scale_denominator;
+                const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
+                const int x0 = ((4 *  t->bx            * d >> ss_hor) + rnd) >> shift;
+                const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;
+
+                for (int x = x0; x < imin(x1, n_units); x++) {
+                    const int px_x = x << (unit_size_log2 + ss_hor);
+                    const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
+                    const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
+                    Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+                    read_restoration_info(t, lr, p, frame_type);
+                }
+            } else {
+                const int x = 4 * t->bx >> ss_hor;
+                if (x & mask) continue;
+                const int w = (f->cur.p.w + ss_hor) >> ss_hor;
+                // Round half up at frame boundaries, if there's more than one
+                // restoration unit
+                if (x && x + half_unit > w) continue;
+                const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
+                const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
+                Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+                read_restoration_info(t, lr, p, frame_type);
+            }
+        }
+        if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
+            return 1;
+        if (t->bx & 16 || f->seq_hdr->sb128) {
+            t->a++;
+            t->lf_mask++;
+        }
+    }
+
+    if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) {
+        dav1d_refmvs_save_tmvs(&t->rt,
+                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+                               t->by >> 1, (t->by + sb_step) >> 1);
+    }
+
+    // backup pre-loopfilter pixels for intra prediction of the next sbrow
+    if (f->frame_thread.pass != 1)
+        f->bd_fn.backup_ipred_edge(t);
+
+    // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
+    // up the initial value in neighbour tiles when running the loopfilter
+    int align_h = (f->bh + 31) & ~31;
+    memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
+           &t->l.tx_lpf_y[t->by & 16], sb_step);
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    align_h >>= ss_ver;
+    memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
+           &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
+
+    return 0;
+}
+
+int dav1d_decode_frame(Dav1dFrameContext *const f) {
+    const Dav1dContext *const c = f->c;
+    int retval = DAV1D_ERR(ENOMEM);
+
+    if (f->n_tc > 1) {
+        const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh;
+        if (titsati_sz != f->tile_thread.titsati_sz) {
+            freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
+            f->tile_thread.task_idx_to_sby_and_tile_idx =
+                malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *
+                       titsati_sz);
+            if (!f->tile_thread.task_idx_to_sby_and_tile_idx) {
+                f->tile_thread.titsati_sz = 0;
+                goto error;
+            }
+            f->tile_thread.titsati_sz = titsati_sz;
+        }
+        if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
+            f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows ||
+            memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows,
+                   sizeof(*f->tile_thread.titsati_index_rows) *
+                       (f->frame_hdr->tiling.rows + 1)))
+        {
+            for (int tile_row = 0, tile_idx = 0;
+                 tile_row < f->frame_hdr->tiling.rows; tile_row++)
+            {
+                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+                     sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
+                {
+                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
+                         tile_col++, tile_idx++)
+                    {
+                        f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][0] = sby;
+                        f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][1] =
+                            tile_row * f->frame_hdr->tiling.cols + tile_col;
+                    }
+                }
+            }
+            f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
+            f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows;
+            memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb,
+                   sizeof(*f->tile_thread.titsati_index_rows) *
+                       (f->frame_hdr->tiling.rows + 1));
+        }
+    }
+
+    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+    if (n_ts != f->n_ts) {
+        if (c->n_fc > 1) {
+            freep(&f->frame_thread.tile_start_off);
+            f->frame_thread.tile_start_off =
+                malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
+            if (!f->frame_thread.tile_start_off) {
+                for (int n = 0; n < f->n_ts; n++) {
+                    Dav1dTileState *const ts = &f->ts[n];
+                    pthread_cond_destroy(&ts->tile_thread.cond);
+                    pthread_mutex_destroy(&ts->tile_thread.lock);
+                }
+                f->n_ts = 0;
+                goto error;
+            }
+        }
+        Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
+        if (!ts_new) goto error;
+        if (n_ts > f->n_ts) {
+            if (f->ts) {
+                memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts);
+                dav1d_free_aligned(f->ts);
+            }
+            f->ts = ts_new;
+            for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) {
+                Dav1dTileState *const ts = &f->ts[n];
+                if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;
+                if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {
+                    pthread_mutex_destroy(&ts->tile_thread.lock);
+                    goto error;
+                }
+            }
+        } else {
+            for (int n = n_ts; n < f->n_ts; n++) {
+                Dav1dTileState *const ts = &f->ts[n];
+                pthread_cond_destroy(&ts->tile_thread.cond);
+                pthread_mutex_destroy(&ts->tile_thread.lock);
+            }
+            memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts);
+            dav1d_free_aligned(f->ts);
+            f->n_ts = n_ts;
+            f->ts = ts_new;
+        }
+    }
+
+    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows;
+    if (a_sz != f->a_sz) {
+        freep(&f->a);
+        f->a = malloc(sizeof(*f->a) * a_sz);
+        if (!f->a) {
+            f->a_sz = 0;
+            goto error;
+        }
+        f->a_sz = a_sz;
+    }
+
+    const int num_sb128 = f->sb128w * f->sb128h;
+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+    const int hbd = !!f->seq_hdr->hbd;
+    if (c->n_fc > 1) {
+        int tile_idx = 0;
+        for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+            int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
+                          f->sb_step * 4 * f->sb128w * 128;
+            int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+                          f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4;
+            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+                f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
+                    f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;
+            }
+        }
+
+        const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
+        if (cf_sz != f->frame_thread.cf_sz) {
+            dav1d_freep_aligned(&f->frame_thread.cf);
+            f->frame_thread.cf =
+                dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 32);
+            if (!f->frame_thread.cf) {
+                f->frame_thread.cf_sz = 0;
+                goto error;
+            }
+            memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
+            f->frame_thread.cf_sz = cf_sz;
+        }
+
+        if (f->frame_hdr->allow_screen_content_tools) {
+            if (num_sb128 != f->frame_thread.pal_sz) {
+                dav1d_freep_aligned(&f->frame_thread.pal);
+                f->frame_thread.pal =
+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
+                                        num_sb128 * 16 * 16, 32);
+                if (!f->frame_thread.pal) {
+                    f->frame_thread.pal_sz = 0;
+                    goto error;
+                }
+                f->frame_thread.pal_sz = num_sb128;
+            }
+
+            const int pal_idx_sz = num_sb128 * size_mul[1];
+            if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
+                dav1d_freep_aligned(&f->frame_thread.pal_idx);
+                f->frame_thread.pal_idx =
+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
+                                        pal_idx_sz * 128 * 128 / 4, 32);
+                if (!f->frame_thread.pal_idx) {
+                    f->frame_thread.pal_idx_sz = 0;
+                    goto error;
+                }
+                f->frame_thread.pal_idx_sz = pal_idx_sz;
+            }
+        } else if (f->frame_thread.pal) {
+            dav1d_freep_aligned(&f->frame_thread.pal);
+            dav1d_freep_aligned(&f->frame_thread.pal_idx);
+            f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
+        }
+    }
+
+    // update allocation of cdef line buffers
+    const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
+    if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
+        dav1d_free_aligned(f->lf.cdef_line_buf);
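+        /* The buffer holds 2 line pairs of luma (4 rows) plus 2 line pairs
+         * per chroma plane (8 rows); strides may be negative for flipped
+         * pictures, in which case the base pointers below are anchored at the
+         * end of each plane's region instead of its start. */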
+        size_t alloc_sz = 64;
+        alloc_sz += (y_stride  < 0 ? -y_stride  : y_stride ) * 4;
+        alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
+        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
+        if (!ptr) {
+            f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
+            goto error;
+        }
+
+        ptr += 32;
+        if (y_stride < 0) {
+            f->lf.cdef_line[0][0] = ptr - y_stride * 1;
+            f->lf.cdef_line[1][0] = ptr - y_stride * 3;
+            ptr -= y_stride * 4;
+        } else {
+            f->lf.cdef_line[0][0] = ptr + y_stride * 0;
+            f->lf.cdef_line[1][0] = ptr + y_stride * 2;
+            ptr += y_stride * 4;
+        }
+        if (uv_stride < 0) {
+            f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
+            f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
+            f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
+            f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
+        } else {
+            f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
+            f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
+            f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
+            f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
+        }
+
+        f->lf.cdef_line_sz[0] = (int) y_stride;
+        f->lf.cdef_line_sz[1] = (int) uv_stride;
+    }
+
+    const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
+    if (lr_line_sz != f->lf.lr_line_sz) {
+        dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
+        uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
+        if (!lr_ptr) {
+            f->lf.lr_line_sz = 0;
+            goto error;
+        }
+
+        for (int pl = 0; pl <= 2; pl++) {
+            f->lf.lr_lpf_line[pl] = lr_ptr;
+            lr_ptr += lr_line_sz * 12;
+        }
+
+        f->lf.lr_line_sz = lr_line_sz;
+    }
+
+    // update allocation for loopfilter masks
+    if (num_sb128 != f->lf.mask_sz) {
+        freep(&f->lf.mask);
+        freep(&f->lf.level);
+        f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
+        // over-allocate by 3 bytes since some of the SIMD implementations
+        // index this from the level type and can thus over-read by up to 3
+        f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
+        if (!f->lf.mask || !f->lf.level) {
+            f->lf.mask_sz = 0;
+            goto error;
+        }
+        if (c->n_fc > 1) {
+            freep(&f->frame_thread.b);
+            freep(&f->frame_thread.cbi);
+            f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
+                                       num_sb128 * 32 * 32);
+            f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
+                                         num_sb128 * 32 * 32);
+            if (!f->frame_thread.b || !f->frame_thread.cbi) {
+                f->lf.mask_sz = 0;
+                goto error;
+            }
+        }
+        f->lf.mask_sz = num_sb128;
+    }
+
+    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
+    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
+    if (lr_mask_sz != f->lf.lr_mask_sz) {
+        freep(&f->lf.lr_mask);
+        f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
+        if (!f->lf.lr_mask) {
+            f->lf.lr_mask_sz = 0;
+            goto error;
+        }
+        f->lf.lr_mask_sz = lr_mask_sz;
+    }
+    f->lf.restore_planes =
+        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
+        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
+        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
+    if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
+        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
+        f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
+    }
+    dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
+    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);
+
+    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
+    if (ipred_edge_sz != f->ipred_edge_sz) {
+        dav1d_freep_aligned(&f->ipred_edge[0]);
+        uint8_t *ptr = f->ipred_edge[0] =
+            dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 32);
+        if (!ptr) {
+            f->ipred_edge_sz = 0;
+            goto error;
+        }
+        f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
+        f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
+        f->ipred_edge_sz = ipred_edge_sz;
+    }
+
+    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+    if (re_sz != f->lf.re_sz) {
+        freep(&f->lf.tx_lpf_right_edge[0]);
+        f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
+        if (!f->lf.tx_lpf_right_edge[0]) {
+            f->lf.re_sz = 0;
+            goto error;
+        }
+        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
+        f->lf.re_sz = re_sz;
+    }
+
+    // init ref mvs
+    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+        const int ret =
+            dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
+                                    f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
+        if (ret < 0) goto error;
+    }
+    retval = DAV1D_ERR(EINVAL);
+
+    // setup dequant tables
+    init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
+    if (f->frame_hdr->quant.qm)
+        for (int j = 0; j < N_RECT_TX_SIZES; j++) {
+            f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
+            f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
+            f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
+        }
+    for (int i = f->frame_hdr->quant.qm; i < 2; i++)
+        for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
+            for (int pl = 0; pl < 3; pl++)
+                f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
+
+    // setup jnt_comp weights
+    if (f->frame_hdr->switchable_comp_refs) {
+        for (int i = 0; i < 7; i++) {
+            const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;
+
+            for (int j = i + 1; j < 7; j++) {
+                const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;
+
+                const unsigned d1 =
+                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
+                                          f->cur.frame_hdr->frame_offset)), 31);
+                const unsigned d0 =
+                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
+                                          f->cur.frame_hdr->frame_offset)), 31);
+                const int order = d0 <= d1;
+
+                static const uint8_t quant_dist_weight[3][2] = {
+                    { 2, 3 }, { 2, 5 }, { 2, 7 }
+                };
+                static const uint8_t quant_dist_lookup_table[4][2] = {
+                    { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
+                };
+
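+                // pick the first k where the cross-multiplied comparison of
+                // d0/d1 against the table's weight ratio flips sign; the
+                // cross-multiplication avoids a division, and k then selects
+                // the weight pair, ordered by which reference is closer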
+                int k;
+                for (k = 0; k < 3; k++) {
+                    const int c0 = quant_dist_weight[k][order];
+                    const int c1 = quant_dist_weight[k][!order];
+                    const int d0_c0 = d0 * c0;
+                    const int d1_c1 = d1 * c1;
+                    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+                }
+
+                f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
+            }
+        }
+    }
+
+    /* Init loopfilter pointers. Offsetting NULL pointers is technically UB,
+     * so just point the chroma pointers in 4:0:0 to the luma plane here to
+     * avoid having additional in-loop branches in various places. We never
+     * dereference those pointers so it doesn't really matter what they
+     * point at, as long as the pointers are valid. */
+    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    f->lf.mask_ptr = f->lf.mask;
+    f->lf.p[0] = f->cur.data[0];
+    f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
+    f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
+    f->lf.sr_p[0] = f->sr_cur.p.data[0];
+    f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
+    f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
+    f->lf.tile_row = 1;
+
+    dav1d_cdf_thread_wait(&f->in_cdf);
+    if (f->frame_hdr->refresh_context)
+        dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
+
+    // parse individual tiles per tile group
+    int update_set = 0, tile_row = 0, tile_col = 0;
+    for (int i = 0; i < f->n_tile_data; i++) {
+        const uint8_t *data = f->tile[i].data.data;
+        size_t size = f->tile[i].data.sz;
+
+        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
+            size_t tile_sz;
+            if (j == f->tile[i].end) {
+                tile_sz = size;
+            } else {
+                if (f->frame_hdr->tiling.n_bytes > size) goto error;
+                tile_sz = 0;
+                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
+                    tile_sz |= (unsigned)*data++ << (k * 8);
+                tile_sz++;
+                size -= f->frame_hdr->tiling.n_bytes;
+                if (tile_sz > size) goto error;
+            }
+
+            setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
+                       c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
+
+            if (tile_col == f->frame_hdr->tiling.cols) {
+                tile_col = 0;
+                tile_row++;
+            }
+            if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
+                update_set = 1;
+            data += tile_sz;
+            size -= tile_sz;
+        }
+    }
+
+    // 2-pass decoding:
+    // - enabled for frame-threading, so that one frame can do symbol parsing
+    //   while one or more others are doing reconstruction. One advantage here
+    //   is that although reconstruction is limited by reference availability,
+    //   symbol parsing is not. Therefore, symbol parsing can effectively use
+    //   row and col tile threading, whereas reconstruction can only use col
+    //   tile threading;
+    // - pass 0 means no 2-pass;
+    // - pass 1 means symbol parsing only;
+    // - pass 2 means reconstruction and loop filtering.
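+    //
+    // Note that the loop below runs exactly once with pass == 0 when 2-pass
+    // decoding is disabled, and twice (pass == 1, then pass == 2) when it is
+    // enabled.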
+
+    const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context;
+    for (f->frame_thread.pass = uses_2pass;
+         f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++)
+    {
+        const enum PlaneType progress_plane_type =
+            f->frame_thread.pass == 0 ? PLANE_TYPE_ALL :
+            f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
+
+        for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
+            reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
+
+        if (f->n_tc == 1) {
+            Dav1dTileContext *const t = f->tc;
+
+            // no tile threading - we explicitly interleave tile/sbrow decoding
+            // and post-filtering, so that the full process runs in-line and
+            // frame threading remains possible
+            for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+                const int sbh_end =
+                    imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
+                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+                     sby < sbh_end; sby++)
+                {
+                    t->by = sby << (4 + f->seq_hdr->sb128);
+                    const int by_end = (t->by + f->sb_step) >> 1;
+                    if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) {
+                        if (c->n_fc > 1) for (int n = 0; n < 7; n++)
+                            if (dav1d_thread_picture_wait(&f->refp[n],
+                                                          4 * (t->by + f->sb_step),
+                                                          PLANE_TYPE_BLOCK))
+                            {
+                                return 1;
+                            }
+                        dav1d_refmvs_load_tmvs(&f->rf, tile_row,
+                                               0, f->bw >> 1, t->by >> 1, by_end);
+                    }
+                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+                        t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+
+                        if (dav1d_decode_tile_sbrow(t)) goto error;
+                    }
+                    if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
+                        dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
+                    }
+
+                    // loopfilter + cdef + restoration
+                    if (f->frame_thread.pass != 1)
+                        f->bd_fn.filter_sbrow(f, sby);
+                    dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
+                                                progress_plane_type);
+                }
+            }
+        } else {
+            // signal available tasks to worker threads
+            int num_tasks;
+
+            pthread_mutex_lock(&f->tile_thread.lock);
+            assert(!f->tile_thread.tasks_left);
+            if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
+                // we can do full-tile decoding here (and if n_tc > tiling.cols,
+                // we in fact need to); the loopfilter happens below
+                num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+            } else {
+                // we need to interleave sbrow decoding for all tile cols in a
+                // tile row, since otherwise subsequent threads will be blocked
+                // waiting for the post-filter to complete
+                num_tasks = f->sbh * f->frame_hdr->tiling.cols;
+            }
+            f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks;
+            pthread_cond_broadcast(&f->tile_thread.cond);
+            pthread_mutex_unlock(&f->tile_thread.lock);
+
+            // loopfilter + cdef + restoration
+            for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+                     sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
+                {
+                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
+                         tile_col++)
+                    {
+                        int progress;
+                        Dav1dTileState *const ts =
+                            &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+
+                        if ((progress = atomic_load(&ts->progress)) <= sby) {
+                            pthread_mutex_lock(&ts->tile_thread.lock);
+                            while ((progress = atomic_load(&ts->progress)) <= sby)
+                                pthread_cond_wait(&ts->tile_thread.cond,
+                                                  &ts->tile_thread.lock);
+                            pthread_mutex_unlock(&ts->tile_thread.lock);
+                        }
+                        if (progress == TILE_ERROR) {
+                            dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR,
+                                                        PLANE_TYPE_ALL);
+                            const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
+                            pthread_mutex_lock(&f->tile_thread.lock);
+                            while (f->tile_thread.available != all_mask)
+                                pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
+                            pthread_mutex_unlock(&f->tile_thread.lock);
+                            goto error;
+                        }
+                    }
+
+                    // loopfilter + cdef + restoration
+                    if (f->frame_thread.pass != 1)
+                        f->bd_fn.filter_sbrow(f, sby);
+                    dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
+                                                progress_plane_type);
+                }
+            }
+
+            const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
+            pthread_mutex_lock(&f->tile_thread.lock);
+            while (f->tile_thread.available != all_mask)
+                pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
+            pthread_mutex_unlock(&f->tile_thread.lock);
+        }
+
+        if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) {
+            // cdf update
+            if (update_set)
+                dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+                                        &f->ts[f->frame_hdr->tiling.update].cdf);
+            dav1d_cdf_thread_signal(&f->out_cdf);
+        }
+        if (f->frame_thread.pass == 1) {
+            assert(c->n_fc > 1);
+            for (int tile_idx = 0;
+                 tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols;
+                 tile_idx++)
+            {
+                Dav1dTileState *const ts = &f->ts[tile_idx];
+                const size_t tile_start_off =
+                    (size_t) f->frame_thread.tile_start_off[tile_idx];
+                ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
+                    &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] :
+                    NULL;
+                ts->frame_thread.cf = f->frame_thread.cf ?
+                    (uint8_t*)f->frame_thread.cf +
+                        ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
+                    NULL;
+                if (f->n_tc > 0) {
+                    const unsigned row_sb_start =
+                        f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
+                    atomic_init(&ts->progress, row_sb_start);
+                }
+            }
+        }
+    }
+
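+    // Success and failure paths converge here: the cleanup below runs in
+    // both cases and releases all per-frame references.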
+    retval = 0;
+error:
+    dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
+                                PLANE_TYPE_ALL);
+    for (int i = 0; i < 7; i++) {
+        if (f->refp[i].p.data[0])
+            dav1d_thread_picture_unref(&f->refp[i]);
+        dav1d_ref_dec(&f->ref_mvs_ref[i]);
+    }
+
+    dav1d_picture_unref_internal(&f->cur);
+    dav1d_thread_picture_unref(&f->sr_cur);
+    dav1d_cdf_thread_unref(&f->in_cdf);
+    if (f->frame_hdr->refresh_context) {
+        dav1d_cdf_thread_signal(&f->out_cdf);
+        dav1d_cdf_thread_unref(&f->out_cdf);
+    }
+    dav1d_ref_dec(&f->cur_segmap_ref);
+    dav1d_ref_dec(&f->prev_segmap_ref);
+    dav1d_ref_dec(&f->mvs_ref);
+    dav1d_ref_dec(&f->seq_hdr_ref);
+    dav1d_ref_dec(&f->frame_hdr_ref);
+
+    for (int i = 0; i < f->n_tile_data; i++)
+        dav1d_data_unref_internal(&f->tile[i].data);
+
+    return retval;
+}
+
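+/* Initial subpel x offset (Q14) for super-resolution upscaling: 'step' is
+ * the Q14 input/output width ratio, 'err' the total stepping error across
+ * the output row, half of which is subtracted to center the sampling phase.
+ * Only the fractional 14-bit part of the result is kept. */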
+static int get_upscale_x0(const int in_w, const int out_w, const int step) {
+    const int err = out_w * step - (in_w << 14);
+    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
+    return x0 & 0x3fff;
+}
+
+int dav1d_submit_frame(Dav1dContext *const c) {
+    Dav1dFrameContext *f;
+    int res = -1;
+
+    // wait for c->out_delayed[next] and move into c->out if visible
+    Dav1dThreadPicture *out_delayed;
+    if (c->n_fc > 1) {
+        const unsigned next = c->frame_thread.next++;
+        if (c->frame_thread.next == c->n_fc)
+            c->frame_thread.next = 0;
+
+        f = &c->fc[next];
+        pthread_mutex_lock(&f->frame_thread.td.lock);
+        while (f->n_tile_data > 0)
+            pthread_cond_wait(&f->frame_thread.td.cond,
+                              &f->frame_thread.td.lock);
+        out_delayed = &c->frame_thread.out_delayed[next];
+        if (out_delayed->p.data[0]) {
+            const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+                                                           memory_order_relaxed);
+            if (out_delayed->visible && progress != FRAME_ERROR)
+                dav1d_picture_ref(&c->out, &out_delayed->p);
+            dav1d_thread_picture_unref(out_delayed);
+        }
+    } else {
+        f = c->fc;
+    }
+
+    f->seq_hdr = c->seq_hdr;
+    f->seq_hdr_ref = c->seq_hdr_ref;
+    dav1d_ref_inc(f->seq_hdr_ref);
+    f->frame_hdr = c->frame_hdr;
+    f->frame_hdr_ref = c->frame_hdr_ref;
+    c->frame_hdr = NULL;
+    c->frame_hdr_ref = NULL;
+    f->dsp = &c->dsp[f->seq_hdr->hbd];
+
+    const int bpc = 8 + 2 * f->seq_hdr->hbd;
+
+    if (!f->dsp->ipred.intra_pred[DC_PRED]) {
+        Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];
+
+        switch (bpc) {
+#define assign_bitdepth_case(bd) \
+            dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
+            dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
+            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
+            dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
+            dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
+            dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
+            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
+            break
+#if CONFIG_8BPC
+        case 8:
+            assign_bitdepth_case(8);
+#endif
+#if CONFIG_16BPC
+        case 10:
+        case 12:
+            assign_bitdepth_case(16);
+#endif
+#undef assign_bitdepth_case
+        default:
+            dav1d_log(c, "Compiled without support for %d-bit decoding\n", bpc);
+            res = DAV1D_ERR(ENOPROTOOPT);
+            goto error;
+        }
+    }
+
+#define assign_bitdepth_case(bd) \
+        f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
+        f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
+        f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
+        f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
+        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
+    if (!f->seq_hdr->hbd) {
+#if CONFIG_8BPC
+        assign_bitdepth_case(8);
+#endif
+    } else {
+#if CONFIG_16BPC
+        assign_bitdepth_case(16);
+#endif
+    }
+#undef assign_bitdepth_case
+
+    int ref_coded_width[7];
+    if (f->frame_hdr->frame_type & 1) {
+        if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
+            const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+            if (!c->refs[pri_ref].p.p.data[0]) {
+                res = DAV1D_ERR(EINVAL);
+                goto error;
+            }
+        }
+        for (int i = 0; i < 7; i++) {
+            const int refidx = f->frame_hdr->refidx[i];
+            if (!c->refs[refidx].p.p.data[0] ||
+                f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
+                f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
+                f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
+                f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
+                f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
+                bpc != c->refs[refidx].p.p.p.bpc)
+            {
+                for (int j = 0; j < i; j++)
+                    dav1d_thread_picture_unref(&f->refp[j]);
+                res = DAV1D_ERR(EINVAL);
+                goto error;
+            }
+            dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
+            ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
+            if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
+                f->frame_hdr->height != c->refs[refidx].p.p.p.h)
+            {
+#define scale_fac(ref_sz, this_sz) \
+    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
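+                // scale is the Q14 ratio ref_size / this_size, rounded to
+                // nearest; step is the same ratio reduced to Q10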
+                f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
+                                               f->frame_hdr->width[0]);
+                f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
+                                               f->frame_hdr->height);
+                f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
+                f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
+            } else {
+                f->svc[i][0].scale = 0;
+            }
+            f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
+                                     !f->frame_hdr->force_integer_mv &&
+                                     !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
+                                     !f->svc[i][0].scale;
+        }
+    }
+
+    // setup entropy
+    if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+        dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac);
+    } else {
+        const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+        dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
+    }
+    if (f->frame_hdr->refresh_context) {
+        res = dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
+        if (res < 0) goto error;
+    }
+
+    // FIXME qsort so tiles are in order (for frame threading)
+    if (f->n_tile_data_alloc < c->n_tile_data) {
+        freep(&f->tile);
+        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
+        f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
+        if (!f->tile) {
+            f->n_tile_data_alloc = f->n_tile_data = 0;
+            res = DAV1D_ERR(ENOMEM);
+            goto error;
+        }
+        f->n_tile_data_alloc = c->n_tile_data;
+    }
+    memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
+    memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
+    f->n_tile_data = c->n_tile_data;
+    c->n_tile_data = 0;
+
+    // allocate frame
+    res = dav1d_thread_picture_alloc(c, f, bpc);
+    if (res < 0) goto error;
+
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+        res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
+        if (res < 0) goto error;
+    } else {
+        dav1d_picture_ref(&f->cur, &f->sr_cur.p);
+    }
+
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+        f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
+        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
+        const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+        f->resize_step[1] = scale_fac(in_cw, out_cw);
+#undef scale_fac
+        f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
+        f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+    }
+
+    // move f->cur into output queue
+    if (c->n_fc == 1) {
+        if (f->frame_hdr->show_frame)
+            dav1d_picture_ref(&c->out, &f->sr_cur.p);
+    } else {
+        dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
+    }
+
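+    // Derived frame geometry: w4/h4 are the frame size in 4-pixel units,
+    // bw/bh in 4-pixel units rounded up to a multiple of 2, sb128w/sb128h
+    // in 128-pixel superblock units, and b4_stride is bw padded to a
+    // multiple of 32.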
+    f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
+    f->h4 = (f->frame_hdr->height + 3) >> 2;
+    f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
+    f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
+    f->sb128w = (f->bw + 31) >> 5;
+    f->sb128h = (f->bh + 31) >> 5;
+    f->sb_shift = 4 + f->seq_hdr->sb128;
+    f->sb_step = 16 << f->seq_hdr->sb128;
+    f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
+    f->b4_stride = (f->bw + 31) & ~31;
+    f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
+
+    // ref_mvs
+    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+        f->mvs_ref = dav1d_ref_create(f->sb128h * 16 * (f->b4_stride >> 1) *
+                                      sizeof(*f->mvs));
+        if (!f->mvs_ref) {
+            res = DAV1D_ERR(ENOMEM);
+            goto error;
+        }
+        f->mvs = f->mvs_ref->data;
+        if (!f->frame_hdr->allow_intrabc) {
+            for (int i = 0; i < 7; i++)
+                f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
+        } else {
+            memset(f->refpoc, 0, sizeof(f->refpoc));
+        }
+        if (f->frame_hdr->use_ref_frame_mvs) {
+            for (int i = 0; i < 7; i++) {
+                const int refidx = f->frame_hdr->refidx[i];
+                if (c->refs[refidx].refmvs != NULL &&
+                    ref_coded_width[i] == f->cur.p.w &&
+                    f->refp[i].p.p.h == f->cur.p.h)
+                {
+                    f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
+                    dav1d_ref_inc(f->ref_mvs_ref[i]);
+                    f->ref_mvs[i] = c->refs[refidx].refmvs->data;
+                } else {
+                    f->ref_mvs[i] = NULL;
+                    f->ref_mvs_ref[i] = NULL;
+                }
+                memcpy(f->refrefpoc[i], c->refs[refidx].refpoc,
+                       sizeof(*f->refrefpoc));
+            }
+        } else {
+            memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+        }
+    } else {
+        f->mvs_ref = NULL;
+        memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+    }
+
+    // segmap
+    if (f->frame_hdr->segmentation.enabled) {
+        // By default, the previous segmentation map is not initialised.
+        f->prev_segmap_ref = NULL;
+        f->prev_segmap = NULL;
+
+        // We might need a previous frame's segmentation map. This
+        // happens if there is either no update or a temporal update.
+        if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
+            const int pri_ref = f->frame_hdr->primary_ref_frame;
+            assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
+            const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
+            const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
+            if (ref_w == f->bw && ref_h == f->bh) {
+                f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
+                if (f->prev_segmap_ref) {
+                    dav1d_ref_inc(f->prev_segmap_ref);
+                    f->prev_segmap = f->prev_segmap_ref->data;
+                }
+            }
+        }
+
+        if (f->frame_hdr->segmentation.update_map) {
+            // We're updating an existing map, but need somewhere to put the
+            // new values. Allocate a buffer here (the data itself gets
+            // filled in elsewhere).
+            f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+            if (!f->cur_segmap_ref) {
+                dav1d_ref_dec(&f->prev_segmap_ref);
+                res = DAV1D_ERR(ENOMEM);
+                goto error;
+            }
+            f->cur_segmap = f->cur_segmap_ref->data;
+        } else if (f->prev_segmap_ref) {
+            // We're not updating an existing map, and we have a valid
+            // reference. Use that.
+            f->cur_segmap_ref = f->prev_segmap_ref;
+            dav1d_ref_inc(f->cur_segmap_ref);
+            f->cur_segmap = f->prev_segmap_ref->data;
+        } else {
+            // We need to make a new map. Allocate one here and zero it out.
+            f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+            if (!f->cur_segmap_ref) {
+                res = DAV1D_ERR(ENOMEM);
+                goto error;
+            }
+            f->cur_segmap = f->cur_segmap_ref->data;
+            memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h);
+        }
+    } else {
+        f->cur_segmap = NULL;
+        f->cur_segmap_ref = NULL;
+        f->prev_segmap_ref = NULL;
+    }
+
+    // update references etc.
+    const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
+    for (int i = 0; i < 8; i++) {
+        if (refresh_frame_flags & (1 << i)) {
+            if (c->refs[i].p.p.data[0])
+                dav1d_thread_picture_unref(&c->refs[i].p);
+            dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
+
+            dav1d_cdf_thread_unref(&c->cdf[i]);
+            if (f->frame_hdr->refresh_context) {
+                dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
+            } else {
+                dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
+            }
+
+            dav1d_ref_dec(&c->refs[i].segmap);
+            c->refs[i].segmap = f->cur_segmap_ref;
+            if (f->cur_segmap_ref)
+                dav1d_ref_inc(f->cur_segmap_ref);
+            dav1d_ref_dec(&c->refs[i].refmvs);
+            if (!f->frame_hdr->allow_intrabc) {
+                c->refs[i].refmvs = f->mvs_ref;
+                if (f->mvs_ref)
+                    dav1d_ref_inc(f->mvs_ref);
+            }
+            memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
+        }
+    }
+
+    if (c->n_fc == 1) {
+        if ((res = dav1d_decode_frame(f)) < 0) {
+            dav1d_picture_unref_internal(&c->out);
+            for (int i = 0; i < 8; i++) {
+                if (refresh_frame_flags & (1 << i)) {
+                    if (c->refs[i].p.p.data[0])
+                        dav1d_thread_picture_unref(&c->refs[i].p);
+                    dav1d_cdf_thread_unref(&c->cdf[i]);
+                    dav1d_ref_dec(&c->refs[i].segmap);
+                    dav1d_ref_dec(&c->refs[i].refmvs);
+                }
+            }
+            return res;
+        }
+    } else {
+        pthread_cond_signal(&f->frame_thread.td.cond);
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+    }
+
+    return 0;
+error:
+    dav1d_cdf_thread_unref(&f->in_cdf);
+    if (f->frame_hdr->refresh_context)
+        dav1d_cdf_thread_unref(&f->out_cdf);
+    for (int i = 0; i < 7; i++) {
+        if (f->refp[i].p.data[0])
+            dav1d_thread_picture_unref(&f->refp[i]);
+        dav1d_ref_dec(&f->ref_mvs_ref[i]);
+    }
+    if (c->n_fc == 1)
+        dav1d_picture_unref_internal(&c->out);
+    else
+        dav1d_thread_picture_unref(out_delayed);
+    dav1d_picture_unref_internal(&f->cur);
+    dav1d_thread_picture_unref(&f->sr_cur);
+    dav1d_ref_dec(&f->mvs_ref);
+    dav1d_ref_dec(&f->seq_hdr_ref);
+    dav1d_ref_dec(&f->frame_hdr_ref);
+
+    for (int i = 0; i < f->n_tile_data; i++)
+        dav1d_data_unref_internal(&f->tile[i].data);
+    f->n_tile_data = 0;
+
+    if (c->n_fc > 1) {
+        pthread_cond_signal(&f->frame_thread.td.cond);
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+    }
+
+    return res;
+}
diff --git a/src/decode.h b/src/decode.h
new file mode 100644 (file)
index 0000000..1eae585
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DECODE_H
+#define DAV1D_SRC_DECODE_H
+
+#include "src/internal.h"
+
+int dav1d_submit_frame(Dav1dContext *c);
+
+#endif /* DAV1D_SRC_DECODE_H */
diff --git a/src/dequant_tables.c b/src/dequant_tables.c
new file mode 100644 (file)
index 0000000..5d80111
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/dequant_tables.h"
+
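+/* Dequantization lookup tables: one sub-table per bitdepth (8, 10 and 12
+ * bpc), indexed by quantizer index, with each entry holding the { DC, AC }
+ * quantizer step sizes from the AV1 specification. */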
+const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = {
+    {
+        {    4,    4, }, {    8,    8, }, {    8,    9, }, {    9,   10, },
+        {   10,   11, }, {   11,   12, }, {   12,   13, }, {   12,   14, },
+        {   13,   15, }, {   14,   16, }, {   15,   17, }, {   16,   18, },
+        {   17,   19, }, {   18,   20, }, {   19,   21, }, {   19,   22, },
+        {   20,   23, }, {   21,   24, }, {   22,   25, }, {   23,   26, },
+        {   24,   27, }, {   25,   28, }, {   26,   29, }, {   26,   30, },
+        {   27,   31, }, {   28,   32, }, {   29,   33, }, {   30,   34, },
+        {   31,   35, }, {   32,   36, }, {   32,   37, }, {   33,   38, },
+        {   34,   39, }, {   35,   40, }, {   36,   41, }, {   37,   42, },
+        {   38,   43, }, {   38,   44, }, {   39,   45, }, {   40,   46, },
+        {   41,   47, }, {   42,   48, }, {   43,   49, }, {   43,   50, },
+        {   44,   51, }, {   45,   52, }, {   46,   53, }, {   47,   54, },
+        {   48,   55, }, {   48,   56, }, {   49,   57, }, {   50,   58, },
+        {   51,   59, }, {   52,   60, }, {   53,   61, }, {   53,   62, },
+        {   54,   63, }, {   55,   64, }, {   56,   65, }, {   57,   66, },
+        {   57,   67, }, {   58,   68, }, {   59,   69, }, {   60,   70, },
+        {   61,   71, }, {   62,   72, }, {   62,   73, }, {   63,   74, },
+        {   64,   75, }, {   65,   76, }, {   66,   77, }, {   66,   78, },
+        {   67,   79, }, {   68,   80, }, {   69,   81, }, {   70,   82, },
+        {   70,   83, }, {   71,   84, }, {   72,   85, }, {   73,   86, },
+        {   74,   87, }, {   74,   88, }, {   75,   89, }, {   76,   90, },
+        {   77,   91, }, {   78,   92, }, {   78,   93, }, {   79,   94, },
+        {   80,   95, }, {   81,   96, }, {   81,   97, }, {   82,   98, },
+        {   83,   99, }, {   84,  100, }, {   85,  101, }, {   85,  102, },
+        {   87,  104, }, {   88,  106, }, {   90,  108, }, {   92,  110, },
+        {   93,  112, }, {   95,  114, }, {   96,  116, }, {   98,  118, },
+        {   99,  120, }, {  101,  122, }, {  102,  124, }, {  104,  126, },
+        {  105,  128, }, {  107,  130, }, {  108,  132, }, {  110,  134, },
+        {  111,  136, }, {  113,  138, }, {  114,  140, }, {  116,  142, },
+        {  117,  144, }, {  118,  146, }, {  120,  148, }, {  121,  150, },
+        {  123,  152, }, {  125,  155, }, {  127,  158, }, {  129,  161, },
+        {  131,  164, }, {  134,  167, }, {  136,  170, }, {  138,  173, },
+        {  140,  176, }, {  142,  179, }, {  144,  182, }, {  146,  185, },
+        {  148,  188, }, {  150,  191, }, {  152,  194, }, {  154,  197, },
+        {  156,  200, }, {  158,  203, }, {  161,  207, }, {  164,  211, },
+        {  166,  215, }, {  169,  219, }, {  172,  223, }, {  174,  227, },
+        {  177,  231, }, {  180,  235, }, {  182,  239, }, {  185,  243, },
+        {  187,  247, }, {  190,  251, }, {  192,  255, }, {  195,  260, },
+        {  199,  265, }, {  202,  270, }, {  205,  275, }, {  208,  280, },
+        {  211,  285, }, {  214,  290, }, {  217,  295, }, {  220,  300, },
+        {  223,  305, }, {  226,  311, }, {  230,  317, }, {  233,  323, },
+        {  237,  329, }, {  240,  335, }, {  243,  341, }, {  247,  347, },
+        {  250,  353, }, {  253,  359, }, {  257,  366, }, {  261,  373, },
+        {  265,  380, }, {  269,  387, }, {  272,  394, }, {  276,  401, },
+        {  280,  408, }, {  284,  416, }, {  288,  424, }, {  292,  432, },
+        {  296,  440, }, {  300,  448, }, {  304,  456, }, {  309,  465, },
+        {  313,  474, }, {  317,  483, }, {  322,  492, }, {  326,  501, },
+        {  330,  510, }, {  335,  520, }, {  340,  530, }, {  344,  540, },
+        {  349,  550, }, {  354,  560, }, {  359,  571, }, {  364,  582, },
+        {  369,  593, }, {  374,  604, }, {  379,  615, }, {  384,  627, },
+        {  389,  639, }, {  395,  651, }, {  400,  663, }, {  406,  676, },
+        {  411,  689, }, {  417,  702, }, {  423,  715, }, {  429,  729, },
+        {  435,  743, }, {  441,  757, }, {  447,  771, }, {  454,  786, },
+        {  461,  801, }, {  467,  816, }, {  475,  832, }, {  482,  848, },
+        {  489,  864, }, {  497,  881, }, {  505,  898, }, {  513,  915, },
+        {  522,  933, }, {  530,  951, }, {  539,  969, }, {  549,  988, },
+        {  559, 1007, }, {  569, 1026, }, {  579, 1046, }, {  590, 1066, },
+        {  602, 1087, }, {  614, 1108, }, {  626, 1129, }, {  640, 1151, },
+        {  654, 1173, }, {  668, 1196, }, {  684, 1219, }, {  700, 1243, },
+        {  717, 1267, }, {  736, 1292, }, {  755, 1317, }, {  775, 1343, },
+        {  796, 1369, }, {  819, 1396, }, {  843, 1423, }, {  869, 1451, },
+        {  896, 1479, }, {  925, 1508, }, {  955, 1537, }, {  988, 1567, },
+        { 1022, 1597, }, { 1058, 1628, }, { 1098, 1660, }, { 1139, 1692, },
+        { 1184, 1725, }, { 1232, 1759, }, { 1282, 1793, }, { 1336, 1828, },
+    }, {
+        {    4,    4, }, {    9,    9, }, {   10,   11, }, {   13,   13, },
+        {   15,   16, }, {   17,   18, }, {   20,   21, }, {   22,   24, },
+        {   25,   27, }, {   28,   30, }, {   31,   33, }, {   34,   37, },
+        {   37,   40, }, {   40,   44, }, {   43,   48, }, {   47,   51, },
+        {   50,   55, }, {   53,   59, }, {   57,   63, }, {   60,   67, },
+        {   64,   71, }, {   68,   75, }, {   71,   79, }, {   75,   83, },
+        {   78,   88, }, {   82,   92, }, {   86,   96, }, {   90,  100, },
+        {   93,  105, }, {   97,  109, }, {  101,  114, }, {  105,  118, },
+        {  109,  122, }, {  113,  127, }, {  116,  131, }, {  120,  136, },
+        {  124,  140, }, {  128,  145, }, {  132,  149, }, {  136,  154, },
+        {  140,  158, }, {  143,  163, }, {  147,  168, }, {  151,  172, },
+        {  155,  177, }, {  159,  181, }, {  163,  186, }, {  166,  190, },
+        {  170,  195, }, {  174,  199, }, {  178,  204, }, {  182,  208, },
+        {  185,  213, }, {  189,  217, }, {  193,  222, }, {  197,  226, },
+        {  200,  231, }, {  204,  235, }, {  208,  240, }, {  212,  244, },
+        {  215,  249, }, {  219,  253, }, {  223,  258, }, {  226,  262, },
+        {  230,  267, }, {  233,  271, }, {  237,  275, }, {  241,  280, },
+        {  244,  284, }, {  248,  289, }, {  251,  293, }, {  255,  297, },
+        {  259,  302, }, {  262,  306, }, {  266,  311, }, {  269,  315, },
+        {  273,  319, }, {  276,  324, }, {  280,  328, }, {  283,  332, },
+        {  287,  337, }, {  290,  341, }, {  293,  345, }, {  297,  349, },
+        {  300,  354, }, {  304,  358, }, {  307,  362, }, {  310,  367, },
+        {  314,  371, }, {  317,  375, }, {  321,  379, }, {  324,  384, },
+        {  327,  388, }, {  331,  392, }, {  334,  396, }, {  337,  401, },
+        {  343,  409, }, {  350,  417, }, {  356,  425, }, {  362,  433, },
+        {  369,  441, }, {  375,  449, }, {  381,  458, }, {  387,  466, },
+        {  394,  474, }, {  400,  482, }, {  406,  490, }, {  412,  498, },
+        {  418,  506, }, {  424,  514, }, {  430,  523, }, {  436,  531, },
+        {  442,  539, }, {  448,  547, }, {  454,  555, }, {  460,  563, },
+        {  466,  571, }, {  472,  579, }, {  478,  588, }, {  484,  596, },
+        {  490,  604, }, {  499,  616, }, {  507,  628, }, {  516,  640, },
+        {  525,  652, }, {  533,  664, }, {  542,  676, }, {  550,  688, },
+        {  559,  700, }, {  567,  713, }, {  576,  725, }, {  584,  737, },
+        {  592,  749, }, {  601,  761, }, {  609,  773, }, {  617,  785, },
+        {  625,  797, }, {  634,  809, }, {  644,  825, }, {  655,  841, },
+        {  666,  857, }, {  676,  873, }, {  687,  889, }, {  698,  905, },
+        {  708,  922, }, {  718,  938, }, {  729,  954, }, {  739,  970, },
+        {  749,  986, }, {  759, 1002, }, {  770, 1018, }, {  782, 1038, },
+        {  795, 1058, }, {  807, 1078, }, {  819, 1098, }, {  831, 1118, },
+        {  844, 1138, }, {  856, 1158, }, {  868, 1178, }, {  880, 1198, },
+        {  891, 1218, }, {  906, 1242, }, {  920, 1266, }, {  933, 1290, },
+        {  947, 1314, }, {  961, 1338, }, {  975, 1362, }, {  988, 1386, },
+        { 1001, 1411, }, { 1015, 1435, }, { 1030, 1463, }, { 1045, 1491, },
+        { 1061, 1519, }, { 1076, 1547, }, { 1090, 1575, }, { 1105, 1603, },
+        { 1120, 1631, }, { 1137, 1663, }, { 1153, 1695, }, { 1170, 1727, },
+        { 1186, 1759, }, { 1202, 1791, }, { 1218, 1823, }, { 1236, 1859, },
+        { 1253, 1895, }, { 1271, 1931, }, { 1288, 1967, }, { 1306, 2003, },
+        { 1323, 2039, }, { 1342, 2079, }, { 1361, 2119, }, { 1379, 2159, },
+        { 1398, 2199, }, { 1416, 2239, }, { 1436, 2283, }, { 1456, 2327, },
+        { 1476, 2371, }, { 1496, 2415, }, { 1516, 2459, }, { 1537, 2507, },
+        { 1559, 2555, }, { 1580, 2603, }, { 1601, 2651, }, { 1624, 2703, },
+        { 1647, 2755, }, { 1670, 2807, }, { 1692, 2859, }, { 1717, 2915, },
+        { 1741, 2971, }, { 1766, 3027, }, { 1791, 3083, }, { 1817, 3143, },
+        { 1844, 3203, }, { 1871, 3263, }, { 1900, 3327, }, { 1929, 3391, },
+        { 1958, 3455, }, { 1990, 3523, }, { 2021, 3591, }, { 2054, 3659, },
+        { 2088, 3731, }, { 2123, 3803, }, { 2159, 3876, }, { 2197, 3952, },
+        { 2236, 4028, }, { 2276, 4104, }, { 2319, 4184, }, { 2363, 4264, },
+        { 2410, 4348, }, { 2458, 4432, }, { 2508, 4516, }, { 2561, 4604, },
+        { 2616, 4692, }, { 2675, 4784, }, { 2737, 4876, }, { 2802, 4972, },
+        { 2871, 5068, }, { 2944, 5168, }, { 3020, 5268, }, { 3102, 5372, },
+        { 3188, 5476, }, { 3280, 5584, }, { 3375, 5692, }, { 3478, 5804, },
+        { 3586, 5916, }, { 3702, 6032, }, { 3823, 6148, }, { 3953, 6268, },
+        { 4089, 6388, }, { 4236, 6512, }, { 4394, 6640, }, { 4559, 6768, },
+        { 4737, 6900, }, { 4929, 7036, }, { 5130, 7172, }, { 5347, 7312, },
+    }, {
+        {     4,     4 }, {    12,    13 }, {    18,    19 }, {    25,    27 },
+        {    33,    35 }, {    41,    44 }, {    50,    54 }, {    60,    64 },
+        {    70,    75 }, {    80,    87 }, {    91,    99 }, {   103,   112 },
+        {   115,   126 }, {   127,   139 }, {   140,   154 }, {   153,   168 },
+        {   166,   183 }, {   180,   199 }, {   194,   214 }, {   208,   230 },
+        {   222,   247 }, {   237,   263 }, {   251,   280 }, {   266,   297 },
+        {   281,   314 }, {   296,   331 }, {   312,   349 }, {   327,   366 },
+        {   343,   384 }, {   358,   402 }, {   374,   420 }, {   390,   438 },
+        {   405,   456 }, {   421,   475 }, {   437,   493 }, {   453,   511 },
+        {   469,   530 }, {   484,   548 }, {   500,   567 }, {   516,   586 },
+        {   532,   604 }, {   548,   623 }, {   564,   642 }, {   580,   660 },
+        {   596,   679 }, {   611,   698 }, {   627,   716 }, {   643,   735 },
+        {   659,   753 }, {   674,   772 }, {   690,   791 }, {   706,   809 },
+        {   721,   828 }, {   737,   846 }, {   752,   865 }, {   768,   884 },
+        {   783,   902 }, {   798,   920 }, {   814,   939 }, {   829,   957 },
+        {   844,   976 }, {   859,   994 }, {   874,  1012 }, {   889,  1030 },
+        {   904,  1049 }, {   919,  1067 }, {   934,  1085 }, {   949,  1103 },
+        {   964,  1121 }, {   978,  1139 }, {   993,  1157 }, {  1008,  1175 },
+        {  1022,  1193 }, {  1037,  1211 }, {  1051,  1229 }, {  1065,  1246 },
+        {  1080,  1264 }, {  1094,  1282 }, {  1108,  1299 }, {  1122,  1317 },
+        {  1136,  1335 }, {  1151,  1352 }, {  1165,  1370 }, {  1179,  1387 },
+        {  1192,  1405 }, {  1206,  1422 }, {  1220,  1440 }, {  1234,  1457 },
+        {  1248,  1474 }, {  1261,  1491 }, {  1275,  1509 }, {  1288,  1526 },
+        {  1302,  1543 }, {  1315,  1560 }, {  1329,  1577 }, {  1342,  1595 },
+        {  1368,  1627 }, {  1393,  1660 }, {  1419,  1693 }, {  1444,  1725 },
+        {  1469,  1758 }, {  1494,  1791 }, {  1519,  1824 }, {  1544,  1856 },
+        {  1569,  1889 }, {  1594,  1922 }, {  1618,  1954 }, {  1643,  1987 },
+        {  1668,  2020 }, {  1692,  2052 }, {  1717,  2085 }, {  1741,  2118 },
+        {  1765,  2150 }, {  1789,  2183 }, {  1814,  2216 }, {  1838,  2248 },
+        {  1862,  2281 }, {  1885,  2313 }, {  1909,  2346 }, {  1933,  2378 },
+        {  1957,  2411 }, {  1992,  2459 }, {  2027,  2508 }, {  2061,  2556 },
+        {  2096,  2605 }, {  2130,  2653 }, {  2165,  2701 }, {  2199,  2750 },
+        {  2233,  2798 }, {  2267,  2847 }, {  2300,  2895 }, {  2334,  2943 },
+        {  2367,  2992 }, {  2400,  3040 }, {  2434,  3088 }, {  2467,  3137 },
+        {  2499,  3185 }, {  2532,  3234 }, {  2575,  3298 }, {  2618,  3362 },
+        {  2661,  3426 }, {  2704,  3491 }, {  2746,  3555 }, {  2788,  3619 },
+        {  2830,  3684 }, {  2872,  3748 }, {  2913,  3812 }, {  2954,  3876 },
+        {  2995,  3941 }, {  3036,  4005 }, {  3076,  4069 }, {  3127,  4149 },
+        {  3177,  4230 }, {  3226,  4310 }, {  3275,  4390 }, {  3324,  4470 },
+        {  3373,  4550 }, {  3421,  4631 }, {  3469,  4711 }, {  3517,  4791 },
+        {  3565,  4871 }, {  3621,  4967 }, {  3677,  5064 }, {  3733,  5160 },
+        {  3788,  5256 }, {  3843,  5352 }, {  3897,  5448 }, {  3951,  5544 },
+        {  4005,  5641 }, {  4058,  5737 }, {  4119,  5849 }, {  4181,  5961 },
+        {  4241,  6073 }, {  4301,  6185 }, {  4361,  6297 }, {  4420,  6410 },
+        {  4479,  6522 }, {  4546,  6650 }, {  4612,  6778 }, {  4677,  6906 },
+        {  4742,  7034 }, {  4807,  7162 }, {  4871,  7290 }, {  4942,  7435 },
+        {  5013,  7579 }, {  5083,  7723 }, {  5153,  7867 }, {  5222,  8011 },
+        {  5291,  8155 }, {  5367,  8315 }, {  5442,  8475 }, {  5517,  8635 },
+        {  5591,  8795 }, {  5665,  8956 }, {  5745,  9132 }, {  5825,  9308 },
+        {  5905,  9484 }, {  5984,  9660 }, {  6063,  9836 }, {  6149, 10028 },
+        {  6234, 10220 }, {  6319, 10412 }, {  6404, 10604 }, {  6495, 10812 },
+        {  6587, 11020 }, {  6678, 11228 }, {  6769, 11437 }, {  6867, 11661 },
+        {  6966, 11885 }, {  7064, 12109 }, {  7163, 12333 }, {  7269, 12573 },
+        {  7376, 12813 }, {  7483, 13053 }, {  7599, 13309 }, {  7715, 13565 },
+        {  7832, 13821 }, {  7958, 14093 }, {  8085, 14365 }, {  8214, 14637 },
+        {  8352, 14925 }, {  8492, 15213 }, {  8635, 15502 }, {  8788, 15806 },
+        {  8945, 16110 }, {  9104, 16414 }, {  9275, 16734 }, {  9450, 17054 },
+        {  9639, 17390 }, {  9832, 17726 }, { 10031, 18062 }, { 10245, 18414 },
+        { 10465, 18766 }, { 10702, 19134 }, { 10946, 19502 }, { 11210, 19886 },
+        { 11482, 20270 }, { 11776, 20670 }, { 12081, 21070 }, { 12409, 21486 },
+        { 12750, 21902 }, { 13118, 22334 }, { 13501, 22766 }, { 13913, 23214 },
+        { 14343, 23662 }, { 14807, 24126 }, { 15290, 24590 }, { 15812, 25070 },
+        { 16356, 25551 }, { 16943, 26047 }, { 17575, 26559 }, { 18237, 27071 },
+        { 18949, 27599 }, { 19718, 28143 }, { 20521, 28687 }, { 21387, 29247 },
+    }
+};
diff --git a/src/dequant_tables.h b/src/dequant_tables.h
new file mode 100644 (file)
index 0000000..66bb3b5
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DEQUANT_TABLES_H
+#define DAV1D_SRC_DEQUANT_TABLES_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
+extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
+
+#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
diff --git a/src/env.h b/src/env.h
new file mode 100644 (file)
index 0000000..7b91c4c
--- /dev/null
+++ b/src/env.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ENV_H
+#define DAV1D_SRC_ENV_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "src/levels.h"
+#include "src/refmvs.h"
+#include "src/tables.h"
+
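+/* Per-block-row ("above") and per-block-column ("left") entropy coding
+ * context, tracked in 4-pixel units; 32 entries cover one 128x128
+ * superblock (16 entries for partition, which is tracked in 8-pixel
+ * units). */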
+typedef struct BlockContext {
+    uint8_t ALIGN(mode[32], 8);
+    uint8_t ALIGN(lcoef[32], 8);
+    uint8_t ALIGN(ccoef[2][32], 8);
+    uint8_t ALIGN(seg_pred[32], 8);
+    uint8_t ALIGN(skip[32], 8);
+    uint8_t ALIGN(skip_mode[32], 8);
+    uint8_t ALIGN(intra[32], 8);
+    uint8_t ALIGN(comp_type[32], 8);
+    int8_t ALIGN(ref[2][32], 8); // -1 means intra
+    uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+    int8_t ALIGN(tx_intra[32], 8);
+    int8_t ALIGN(tx[32], 8);
+    uint8_t ALIGN(tx_lpf_y[32], 8);
+    uint8_t ALIGN(tx_lpf_uv[32], 8);
+    uint8_t ALIGN(partition[16], 8);
+    uint8_t ALIGN(uvmode[32], 8);
+    uint8_t ALIGN(pal_sz[32], 8);
+} BlockContext;
+
+static inline int get_intra_ctx(const BlockContext *const a,
+                                const BlockContext *const l,
+                                const int yb4, const int xb4,
+                                const int have_top, const int have_left)
+{
+    if (have_left) {
+        if (have_top) {
+            const int ctx = l->intra[yb4] + a->intra[xb4];
+            return ctx + (ctx == 2);
+        } else
+            return l->intra[yb4] * 2;
+    } else {
+        return have_top ? a->intra[xb4] * 2 : 0;
+    }
+}
+
+static inline int get_tx_ctx(const BlockContext *const a,
+                             const BlockContext *const l,
+                             const TxfmInfo *const max_tx,
+                             const int yb4, const int xb4)
+{
+    return (l->tx_intra[yb4] >= max_tx->lh) + (a->tx_intra[xb4] >= max_tx->lw);
+}
+
+static inline int get_partition_ctx(const BlockContext *const a,
+                                    const BlockContext *const l,
+                                    const enum BlockLevel bl,
+                                    const int yb8, const int xb8)
+{
+    return ((a->partition[xb8] >> (4 - bl)) & 1) +
+          (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
+}
+
+static inline unsigned gather_left_partition_prob(const uint16_t *const in,
+                                                  const enum BlockLevel bl)
+{
+    unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
+    // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
+    // PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
+    out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
+    if (bl != BL_128X128)
+        out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
+    return out;
+}
+
+static inline unsigned gather_top_partition_prob(const uint16_t *const in,
+                                                 const enum BlockLevel bl)
+{
+    // Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
+    // PARTITION_T_TOP_SPLIT are neighbors.
+    unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
+    // Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
+    // PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
+    // PARTITION_V4 is always zero, and the probability for
+    // PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
+    out += in[PARTITION_T_LEFT_SPLIT - 1];
+    if (bl != BL_128X128)
+        out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
+    return out;
+}
+
+static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
+                                              const enum TxfmType ytxtp)
+{
+    if (uvt_dim->max == TX_32X32)
+        return ytxtp == IDTX ? IDTX : DCT_DCT;
+    if (uvt_dim->min == TX_16X16 &&
+        ((1 << ytxtp) & ((1 << H_FLIPADST) | (1 << V_FLIPADST) |
+                         (1 << H_ADST) | (1 << V_ADST))))
+    {
+        return DCT_DCT;
+    }
+
+    return ytxtp;
+}
+
+static inline int get_filter_ctx(const BlockContext *const a,
+                                 const BlockContext *const l,
+                                 const int comp, const int dir, const int ref,
+                                 const int yb4, const int xb4)
+{
+    const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
+                         a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
+    const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
+                         l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
+
+    if (a_filter == l_filter) {
+        return comp * 4 + a_filter;
+    } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+        return comp * 4 + l_filter;
+    } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+        return comp * 4 + a_filter;
+    } else {
+        return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
+    }
+}
+
+static inline int get_comp_ctx(const BlockContext *const a,
+                               const BlockContext *const l,
+                               const int yb4, const int xb4,
+                               const int have_top, const int have_left)
+{
+    if (have_top) {
+        if (have_left) {
+            if (a->comp_type[xb4]) {
+                if (l->comp_type[yb4]) {
+                    return 4;
+                } else {
+                    // the unsigned >= 4U check catches intra (-1 wraps) and bwd (>= 4)
+                    return 2 + ((unsigned)l->ref[0][yb4] >= 4U);
+                }
+            } else if (l->comp_type[yb4]) {
+                // the unsigned >= 4U check catches intra (-1 wraps) and bwd (>= 4)
+                return 2 + ((unsigned)a->ref[0][xb4] >= 4U);
+            } else {
+                return (l->ref[0][yb4] >= 4) ^ (a->ref[0][xb4] >= 4);
+            }
+        } else {
+            return a->comp_type[xb4] ? 3 : a->ref[0][xb4] >= 4;
+        }
+    } else if (have_left) {
+        return l->comp_type[yb4] ? 3 : l->ref[0][yb4] >= 4;
+    } else {
+        return 1;
+    }
+}
+
+static inline int get_comp_dir_ctx(const BlockContext *const a,
+                                   const BlockContext *const l,
+                                   const int yb4, const int xb4,
+                                   const int have_top, const int have_left)
+{
+#define has_uni_comp(edge, off) \
+    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
+
+    if (have_top && have_left) {
+        const int a_intra = a->intra[xb4], l_intra = l->intra[yb4];
+
+        if (a_intra && l_intra) return 2;
+        if (a_intra || l_intra) {
+            const BlockContext *const edge = a_intra ? l : a;
+            const int off = a_intra ? yb4 : xb4;
+
+            if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+            return 1 + 2 * has_uni_comp(edge, off);
+        }
+
+        const int a_comp = a->comp_type[xb4] != COMP_INTER_NONE;
+        const int l_comp = l->comp_type[yb4] != COMP_INTER_NONE;
+        const int a_ref0 = a->ref[0][xb4], l_ref0 = l->ref[0][yb4];
+
+        if (!a_comp && !l_comp) {
+            return 1 + 2 * ((a_ref0 >= 4) == (l_ref0 >= 4));
+        } else if (!a_comp || !l_comp) {
+            const BlockContext *const edge = a_comp ? a : l;
+            const int off = a_comp ? xb4 : yb4;
+
+            if (!has_uni_comp(edge, off)) return 1;
+            return 3 + ((a_ref0 >= 4) == (l_ref0 >= 4));
+        } else {
+            const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
+
+            if (!a_uni && !l_uni) return 0;
+            if (!a_uni || !l_uni) return 2;
+            return 3 + ((a_ref0 == 4) == (l_ref0 == 4));
+        }
+    } else if (have_top || have_left) {
+        const BlockContext *const edge = have_left ? l : a;
+        const int off = have_left ? yb4 : xb4;
+
+        if (edge->intra[off]) return 2;
+        if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+        return 4 * has_uni_comp(edge, off);
+    } else {
+        return 2;
+    }
+}
+
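+/* Difference between two picture order counts, interpreted modulo the
+ * order-hint range: with order_hint_n_bits == 7, POCs wrap at 128 and
+ * differences map into [-64, 63], so e.g. get_poc_diff(7, 2, 126) == 4. */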
+static inline int get_poc_diff(const int order_hint_n_bits,
+                               const int poc0, const int poc1)
+{
+    if (!order_hint_n_bits) return 0;
+    const int mask = 1 << (order_hint_n_bits - 1);
+    const int diff = poc0 - poc1;
+    return (diff & (mask - 1)) - (diff & mask);
+}
+
+static inline int get_jnt_comp_ctx(const int order_hint_n_bits,
+                                   const unsigned poc, const unsigned ref0poc,
+                                   const unsigned ref1poc,
+                                   const BlockContext *const a,
+                                   const BlockContext *const l,
+                                   const int yb4, const int xb4)
+{
+    const unsigned d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
+    const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
+    const int offset = d0 == d1;
+    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
+                      a->ref[0][xb4] == 6;
+    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_AVG ||
+                      l->ref[0][yb4] == 6;
+
+    return 3 * offset + a_ctx + l_ctx;
+}
+
+static inline int get_mask_comp_ctx(const BlockContext *const a,
+                                    const BlockContext *const l,
+                                    const int yb4, const int xb4)
+{
+    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_SEG ? 1 :
+                      a->ref[0][xb4] == 6 ? 3 : 0;
+    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_SEG ? 1 :
+                      l->ref[0][yb4] == 6 ? 3 : 0;
+
+    return imin(a_ctx + l_ctx, 5);
+}
+
+#define av1_get_ref_2_ctx av1_get_bwd_ref_ctx
+#define av1_get_ref_3_ctx av1_get_fwd_ref_ctx
+#define av1_get_ref_4_ctx av1_get_fwd_ref_1_ctx
+#define av1_get_ref_5_ctx av1_get_fwd_ref_2_ctx
+#define av1_get_ref_6_ctx av1_get_bwd_ref_1_ctx
+#define av1_get_uni_p_ctx av1_get_ref_ctx
+#define av1_get_uni_p2_ctx av1_get_fwd_ref_2_ctx
+
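+/* The ref-bit context helpers below count how many of the above/left
+ * neighbor references fall on either side of the bit being coded (e.g.
+ * forward refs 0-3 vs. backward refs 4-6 for the first bit) and return
+ * 0, 1 or 2 depending on which side dominates. */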
+static inline int av1_get_ref_ctx(const BlockContext *const a,
+                                  const BlockContext *const l,
+                                  const int yb4, const int xb4,
+                                  int have_top, int have_left)
+{
+    int cnt[2] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        cnt[a->ref[0][xb4] >= 4]++;
+        if (a->comp_type[xb4]) cnt[a->ref[1][xb4] >= 4]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        cnt[l->ref[0][yb4] >= 4]++;
+        if (l->comp_type[yb4]) cnt[l->ref[1][yb4] >= 4]++;
+    }
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_ctx(const BlockContext *const a,
+                                      const BlockContext *const l,
+                                      const int yb4, const int xb4,
+                                      const int have_top, const int have_left)
+{
+    int cnt[4] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] < 4) cnt[a->ref[0][xb4]]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] < 4) cnt[a->ref[1][xb4]]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] < 4) cnt[l->ref[0][yb4]]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] < 4) cnt[l->ref[1][yb4]]++;
+    }
+
+    cnt[0] += cnt[1];
+    cnt[2] += cnt[3];
+
+    return cnt[0] == cnt[2] ? 1 : cnt[0] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_1_ctx(const BlockContext *const a,
+                                        const BlockContext *const l,
+                                        const int yb4, const int xb4,
+                                        const int have_top, const int have_left)
+{
+    int cnt[2] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] < 2) cnt[a->ref[0][xb4]]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] < 2) cnt[a->ref[1][xb4]]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] < 2) cnt[l->ref[0][yb4]]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] < 2) cnt[l->ref[1][yb4]]++;
+    }
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_2_ctx(const BlockContext *const a,
+                                        const BlockContext *const l,
+                                        const int yb4, const int xb4,
+                                        const int have_top, const int have_left)
+{
+    int cnt[2] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if ((a->ref[0][xb4] ^ 2U) < 2) cnt[a->ref[0][xb4] - 2]++;
+        if (a->comp_type[xb4] && (a->ref[1][xb4] ^ 2U) < 2) cnt[a->ref[1][xb4] - 2]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if ((l->ref[0][yb4] ^ 2U) < 2) cnt[l->ref[0][yb4] - 2]++;
+        if (l->comp_type[yb4] && (l->ref[1][yb4] ^ 2U) < 2) cnt[l->ref[1][yb4] - 2]++;
+    }
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_ctx(const BlockContext *const a,
+                                      const BlockContext *const l,
+                                      const int yb4, const int xb4,
+                                      const int have_top, const int have_left)
+{
+    int cnt[3] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+    }
+
+    cnt[1] += cnt[0];
+
+    return cnt[2] == cnt[1] ? 1 : cnt[1] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_1_ctx(const BlockContext *const a,
+                                        const BlockContext *const l,
+                                        const int yb4, const int xb4,
+                                        const int have_top, const int have_left)
+{
+    int cnt[3] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+    }
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
+                                     const BlockContext *const l,
+                                     const int yb4, const int xb4,
+                                     const int have_top, const int have_left)
+{
+    int cnt[3] = { 0 };
+
+    if (have_top && !a->intra[xb4]) {
+        if (a->ref[0][xb4] - 1U < 3) cnt[a->ref[0][xb4] - 1]++;
+        if (a->comp_type[xb4] && a->ref[1][xb4] - 1U < 3) cnt[a->ref[1][xb4] - 1]++;
+    }
+
+    if (have_left && !l->intra[yb4]) {
+        if (l->ref[0][yb4] - 1U < 3) cnt[l->ref[0][yb4] - 1]++;
+        if (l->comp_type[yb4] && l->ref[1][yb4] - 1U < 3) cnt[l->ref[1][yb4] - 1]++;
+    }
+
+    cnt[1] += cnt[2];
+
+    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
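+// Context for the dynamic reference list (DRL) index, based on whether the
+// two candidates' weights reach the "category" threshold (640, matching
+// REF_CAT_LEVEL in the reference implementation).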
+static inline int get_drl_context(const refmvs_candidate *const ref_mv_stack,
+                                  const int ref_idx)
+{
+    if (ref_mv_stack[ref_idx].weight >= 640)
+        return ref_mv_stack[ref_idx + 1].weight < 640;
+
+    return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
+}
+
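+// Predict the segment ID from the left, top and top-left neighbors in the
+// current frame's segmentation map; *seg_ctx reflects how many of the three
+// agree (2 if all, 1 if any pair, 0 otherwise).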
+static inline unsigned get_cur_frame_segid(const int by, const int bx,
+                                           const int have_top,
+                                           const int have_left,
+                                           int *const seg_ctx,
+                                           const uint8_t *cur_seg_map,
+                                           const ptrdiff_t stride)
+{
+    cur_seg_map += bx + by * stride;
+    if (have_left && have_top) {
+        const int l = cur_seg_map[-1];
+        const int a = cur_seg_map[-stride];
+        const int al = cur_seg_map[-(stride + 1)];
+
+        if (l == a && al == l) *seg_ctx = 2;
+        else if (l == a || al == l || a == al) *seg_ctx = 1;
+        else *seg_ctx = 0;
+        return a == al ? a : l;
+    } else {
+        *seg_ctx = 0;
+        return have_left ? cur_seg_map[-1] : have_top ? cur_seg_map[-stride] : 0;
+    }
+}
+
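+// Round a 1/8-pel motion vector to full-pel precision (multiples of 8),
+// with halfway values rounded toward zero; mv->x >> 15 extracts the sign
+// bit of the 16-bit component to keep the rounding symmetric.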
+static inline void fix_int_mv_precision(mv *const mv) {
+    mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U;
+    mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U;
+}
+
+static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
+                                    mv *const mv)
+{
+    if (hdr->force_integer_mv) {
+        fix_int_mv_precision(mv);
+    } else if (!hdr->hp) {
+        mv->x = (mv->x - (mv->x >> 15)) & ~1U;
+        mv->y = (mv->y - (mv->y >> 15)) & ~1U;
+    }
+}
+
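+// Project the block center through the global warp model to obtain a
+// translational MV, rounded to the frame's MV precision (1/4- or 1/8-pel
+// depending on hdr->hp, or full-pel if force_integer_mv is set).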
+static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
+                            const int bx4, const int by4,
+                            const int bw4, const int bh4,
+                            const Dav1dFrameHeader *const hdr)
+{
+    switch (gmv->type) {
+    case DAV1D_WM_TYPE_ROT_ZOOM:
+        assert(gmv->matrix[5] ==  gmv->matrix[2]);
+        assert(gmv->matrix[4] == -gmv->matrix[3]);
+        // fall-through
+    default:
+    case DAV1D_WM_TYPE_AFFINE: {
+        const int x = bx4 * 4 + bw4 * 2 - 1;
+        const int y = by4 * 4 + bh4 * 2 - 1;
+        const int xc = (gmv->matrix[2] - (1 << 16)) * x +
+                       gmv->matrix[3] * y + gmv->matrix[0];
+        const int yc = (gmv->matrix[5] - (1 << 16)) * y +
+                       gmv->matrix[4] * x + gmv->matrix[1];
+        const int shift = 16 - (3 - !hdr->hp);
+        const int round = (1 << shift) >> 1;
+        mv res = (mv) {
+            .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
+            .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
+        };
+        if (hdr->force_integer_mv)
+            fix_int_mv_precision(&res);
+        return res;
+    }
+    case DAV1D_WM_TYPE_TRANSLATION: {
+        mv res = (mv) {
+            .y = gmv->matrix[0] >> 13,
+            .x = gmv->matrix[1] >> 13,
+        };
+        if (hdr->force_integer_mv)
+            fix_int_mv_precision(&res);
+        return res;
+    }
+    case DAV1D_WM_TYPE_IDENTITY:
+        return (mv) { .x = 0, .y = 0 };
+    }
+}
+
+#endif /* DAV1D_SRC_ENV_H */
diff --git a/src/ext/x86/x86inc.asm b/src/ext/x86/x86inc.asm
new file mode 100644 (file)
index 0000000..c252e54
--- /dev/null
@@ -0,0 +1,1809 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2019 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Henrik Gramner <henrik@gramner.com>
+;*          Anton Mitrofanov <BugMaster@narod.ru>
+;*          Fiona Glaser <fiona@x264.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to x264-devel@videolan.org .
+
+%include "config.asm"
+
+%ifndef private_prefix
+    %define private_prefix dav1d
+%endif
+
+%ifndef public_prefix
+    %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,x64
+        %define WIN64  1
+    %else
+        %define UNIX64 1
+    %endif
+%endif
+
+%define FORMAT_ELF 0
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,elf
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,macho
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+    %define FORMAT_MACHO 1
+%endif
+
+%ifdef PREFIX
+    %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
+%endif
+
+%macro SECTION_RODATA 0-1 16
+    %ifidn __OUTPUT_FORMAT__,win32
+        SECTION .rdata align=%1
+    %elif WIN64
+        SECTION .rdata align=%1
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+%if ARCH_X86_64
+    %define PIC 1 ; always use PIC on x86-64
+    default rel
+%elifidn __OUTPUT_FORMAT__,win32
+    %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+    %define PIC 0
+%endif
+
+%define HAVE_PRIVATE_EXTERN 1
+%ifdef __NASM_VER__
+    %use smartalign
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+        %define HAVE_PRIVATE_EXTERN 0
+    %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment the stack will be manually aligned
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
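+; Inside foo, the named arguments can then be referenced with the usual size
+; suffixes, e.g. (hypothetical usage):
+;   mov tmpd, [srcq]  ; srcq/dstq/tmpq are native-size, tmpd is the low dword
+;   mov [dstq], tmpd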
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
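+;
+; e.g. (hypothetical usage): "mov r0d, 1" writes the dword part of the first
+; argument register, and "mov r1, r4mp" loads argument 4 from wherever the
+; ABI placed it (a register or the stack).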
+
+%macro DECLARE_REG 2-3
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %define %2q %2
+    %if %0 == 2
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp qword r %+ %1 %+ m
+    %else
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp dword r %+ %1 %+ m
+    %endif
+    %define r%1  %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+    %if ARCH_X86_64 == 0
+        %define r%1 e%1
+    %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPUs
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
+%macro PUSH 1
+    push %1
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
+%endmacro
+
+%macro POP 1
+    pop %1
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2
+    sub %1, %2
+    %ifidn %1, rstk
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2
+    add %1, %2
+    %ifidn %1, rstk
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%if ARCH_X86_64 == 0
+    %define movsxd movifnidn
+%endif
+
+%macro movsxdifnidn 2
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1
+    %if (%1) == 0
+        %error assertion ``%1'' failed
+    %endif
+%endmacro
+
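+; Give the argument registers human-readable names (dst, src, ...); also
+; invoked implicitly via the name list passed to cglobal/PROLOGUE.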
+%macro DEFINE_ARGS 0-*
+    %ifdef n_arg_names
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
+
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1
+        %if %1 != 0
+            %assign %%pad 0
+            %assign stack_size %1
+            %if stack_size < 0
+                %assign stack_size -stack_size
+            %endif
+            %if WIN64
+                %assign %%pad %%pad + 32 ; shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+                    %endif
+                %endif
+            %endif
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1)
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                %if %1 < 0 ; need to store rsp on stack
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
+                %else ; can keep rsp in rstk during whole function
+                    %xdefine rstkm rstk
+                %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk
+            %endif
+            WIN64_PUSH_XMM
+        %endif
+    %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 0-1 0
+    %ifnum %1
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+            %if %1 > 0
+                ; Reserve an additional register for storing the original stack pointer, but avoid using
+                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
+                %assign regs_used (regs_used + 1)
+                %if ARCH_X86_64 && regs_used == 7
+                    %assign regs_used 8
+                %elif ARCH_X86_64 == 0 && regs_used == 1
+                    %assign regs_used 2
+                %endif
+            %endif
+            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+                %assign regs_used 5 + UNIX64 * 3
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+
+    %ifnum %2
+        DEFINE_ARGS %3
+    %elif %1 == 4
+        DEFINE_ARGS %2
+    %elif %1 > 4
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6 + high_mm_regs
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7 + high_mm_regs
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i 8
+        %rep %%xmm_regs_on_stack
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+        SUB rsp, stack_size_padded
+    %endif
+    WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 0
+    %assign %%pad_size 0
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
+            %assign %%i %%i-1
+            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
+        %endrep
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+            %assign %%pad_size stack_size_padded
+        %endif
+    %endif
+    %if xmm_regs_used > 7 + high_mm_regs
+        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6 + high_mm_regs
+        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 0
+    WIN64_RESTORE_XMM_INTERNAL
+    %assign stack_offset (stack_offset-stack_size_padded)
+    %assign stack_size_padded 0
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
+
+%macro RET 0
+    WIN64_RESTORE_XMM_INTERNAL
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+    %if vzeroupper_required
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    %assign xmm_regs_used %3
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+    %if vzeroupper_required
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+    %rep %0
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7
+        %assign num_args 7
+    %endif
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 7
+    PUSH_IF_USED 3, 4, 5, 6
+    ALLOC_STACK %4
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 6, 5, 4, 3
+    %if vzeroupper_required
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+    %macro WIN64_SPILL_XMM 1
+        %assign xmm_regs_used %1
+    %endmacro
+    %macro WIN64_RESTORE_XMM 0
+        %assign xmm_regs_used 0
+    %endmacro
+    %macro WIN64_PUSH_XMM 0
+    %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+    %if has_epilogue || cpuflag(ssse3)
+        RET
+    %else
+        rep ret
+    %endif
+    annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %if notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %endif
+    ret
+    annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %if notcpuflag(ssse3)
+                %%branch_instr equ $
+                %xdefine last_branch_adr %%branch_instr
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+    annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+    annotate_function_size
+    %ifndef cglobaled_%2
+        %if %1
+            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+        %else
+            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+        %endif
+        %xdefine %2.skip_prologue %2 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %2, 1
+    %endif
+    %xdefine current_function %2
+    %xdefine current_function_section __SECT__
+    %if FORMAT_ELF
+        %if %1
+            global %2:function hidden
+        %else
+            global %2:function
+        %endif
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+        global %2:private_extern
+    %else
+        global %2
+    %endif
+    align function_align
+    %2:
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
+    %ifnidn %3, ""
+        PROLOGUE %3
+    %endif
+%endmacro
+
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+    %if FORMAT_ELF
+        global current_function %+ %1:function hidden
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global current_function %+ %1:private_extern
+    %else
+        global current_function %+ %1
+    %endif
+    %1:
+%endmacro
+
+%macro cextern 1
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+    %ifdef PREFIX
+        %xdefine %1 mangle(%1)
+    %endif
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+%macro const 1-2+
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    %if FORMAT_ELF
+        global %1:data hidden
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global %1:private_extern
+    %else
+        global %1
+    %endif
+    %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+    %ifdef __YASM_VER__
+        %ifdef current_function
+            %if FORMAT_ELF
+                current_function_section
+                %%ecf equ $
+                size current_function %%ecf - current_function
+                __SECT__
+            %endif
+        %endif
+    %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx       (1<<0)
+%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
+%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
+%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
+%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
+%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
+%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
+%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
+%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
+%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
+%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
+%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
+%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
+%assign cpuflags_gfni      (1<<13) | cpuflags_sse42
+%assign cpuflags_avx       (1<<14) | cpuflags_sse42
+%assign cpuflags_xop       (1<<15) | cpuflags_avx
+%assign cpuflags_fma4      (1<<16) | cpuflags_avx
+%assign cpuflags_fma3      (1<<17) | cpuflags_avx
+%assign cpuflags_bmi1      (1<<18) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2      (1<<19) | cpuflags_bmi1
+%assign cpuflags_avx2      (1<<20) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512    (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32   (1<<23)
+%assign cpuflags_cache64   (1<<24)
+%assign cpuflags_aligned   (1<<25) ; not a cpu feature, but a function variant
+%assign cpuflags_atom      (1<<26)
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
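+; e.g. code paths can be guarded per-variant (hypothetical usage):
+;   %if cpuflag(ssse3)
+;       pshufb m0, m1
+;   %endif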
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
+%macro INIT_CPUFLAGS 0-*
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
+    %if %0 >= 1
+        %rep %0
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
+        %xdefine SUFFIX _ %+ cpuname
+
+        %if cpuflag(avx)
+            %assign avx_enabled 1
+        %endif
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned)
+            %define movu mova
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
+            %define movu lddqu
+        %endif
+    %endif
+
+    %if ARCH_X86_64 || cpuflag(sse2)
+        %ifdef __NASM_VER__
+            ALIGNMODE p6
+        %else
+            CPU amdnop
+        %endif
+    %else
+        %ifdef __NASM_VER__
+            ALIGNMODE nop
+        %else
+            CPU basicnop
+        %endif
+    %endif
+%endmacro
+
+; Merge mmx, sse*, and avx*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
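+;
+; e.g. under INIT_YMM, m0 expands to ymm0, xm0 to xmm0 (its low half), and
+; zm0 falls back to ymm0 since mmsize is only 32.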
+
+%macro CAT_XDEFINE 3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+    %undef %1%2
+%endmacro
+
+%macro DEFINE_MMREGS 1 ; mmtype
+    %assign %%prev_mmregs 0
+    %ifdef num_mmregs
+        %assign %%prev_mmregs num_mmregs
+    %endif
+
+    %assign num_mmregs 8
+    %if ARCH_X86_64 && mmsize >= 16
+        %assign num_mmregs 16
+        %if cpuflag(avx512) || mmsize == 64
+            %assign num_mmregs 32
+        %endif
+    %endif
+
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, %1 %+ %%i
+        CAT_XDEFINE nn%1, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %if %%prev_mmregs > num_mmregs
+        %rep %%prev_mmregs - num_mmregs
+            CAT_UNDEF m, %%i
+            CAT_UNDEF nn %+ mmtype, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+    %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+%macro INIT_MMX 0-1+
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS mm
+%endmacro
+
+%macro INIT_XMM 0-1+
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS xmm
+    %if WIN64
+        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+    %endif
+%endmacro
+
+%macro INIT_YMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS ymm
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS zmm
+    AVX512_MM_PERMUTATION
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1
+    %define  mmmm%1   mm%1
+    %define  mmxmm%1  mm%1
+    %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
+    %define xmmmm%1   mm%1
+    %define xmmxmm%1 xmm%1
+    %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
+    %define ymmmm%1   mm%1
+    %define ymmxmm%1 xmm%1
+    %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
+    %define xm%1 xmm %+ m%1
+    %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 32
+    DECLARE_MMCAST i
+    %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
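+;
+; e.g. after "SWAP 0, 1", subsequent uses of m0 assemble to the register that
+; was previously m1 and vice versa; RESET_MM_PERMUTATION restores the
+; identity mapping.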
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+    %rep %0/2
+        %xdefine %%tmp%2 m%2
+        %rotate 2
+    %endrep
+    %rep %0/2
+        %xdefine m%1 %%tmp%2
+        CAT_XDEFINE nn, m%1, %1
+        %rotate 2
+    %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+    %ifnum %1 ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %else ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-*
+    %rep %0-1
+        %xdefine %%tmp m%1
+        %xdefine m%1 m%2
+        %xdefine m%2 %%tmp
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+    %xdefine %%args nn %+ %1
+    %rep %0-1
+        %xdefine %%args %%args, nn %+ %2
+        %rotate 1
+    %endrep
+    SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs
+        %xdefine %%tmp m %+ %%i
+        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
+        %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %xdefine %%tmp %%f %+ 0
+    %ifnum %%tmp
+        RESET_MM_PERMUTATION
+        AVX512_MM_PERMUTATION
+        %assign %%i 0
+        %rep num_mmregs
+            %xdefine %%tmp %%f %+ %%i
+            CAT_XDEFINE %%m, %%i, m %+ %%tmp
+            %assign %%i %%i+1
+        %endrep
+        %rep num_mmregs
+            %assign %%i %%i-1
+            CAT_XDEFINE m, %%i, %%m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+    %ifid %1
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        call %1
+    %endif
+%endmacro
+%macro call_internal 2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
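+; (x86 sign-extends 8-bit immediates: -128 fits in imm8 but +128 does not, so
+; "sub x, -128" saves 3 bytes over "add x, 128", and vice versa.)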
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 32
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
+    %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-avx emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
+        %assign __sizeofreg sizeof%6
+    %else
+        %assign __sizeofreg mmsize
+    %endif
+    %assign __emulate_avx 0
+    %if avx_enabled && __sizeofreg >= 16
+        %xdefine __instr v%1
+    %else
+        %xdefine __instr %1
+        %if %0 >= 8+%4
+            %assign __emulate_avx 1
+        %endif
+    %endif
+    %ifnidn %2, fnord
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
+                %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+                %error use of ``%1'' avx2 instruction in cpuname function: current_function
+            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+                %ifnid %6       ; but sse4 is required for memory operands
+                    %if notcpuflag(sse4)
+                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
+                    %endif
+                %endif
+            %endif
+        %endif
+    %endif
+
+    %if __emulate_avx
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %if %5 && %4 == 0
+            %ifnidn %6, %7
+                %ifidn %6, %8
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %elifnnum sizeof%8
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %endif
+            %endif
+        %endif
+        %ifnidn %6, __src1
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+            %endif
+            %if __sizeofreg == 8
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
+            %else
+                MOVDQA %6, __src1
+            %endif
+        %endif
+        %if %0 >= 9
+            %1 %6, __src2, %9
+        %else
+            %1 %6, __src2
+        %endif
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        %if avx_enabled && %5
+            %xdefine __src1 %7
+            %xdefine __src2 %8
+            %ifnum regnumof%7
+                %ifnum regnumof%8
+                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                        ; Most VEX-encoded instructions require an additional byte to encode when
+                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                        ; we can swap src1 and src2 when doing so reduces the instruction length.
+                        %xdefine __src1 %8
+                        %xdefine __src2 %7
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7, %8
+        %endif
+    %elif %0 == 7
+        %if avx_enabled && %5
+            %xdefine __src1 %6
+            %xdefine __src2 %7
+            %ifnum regnumof%6
+                %ifnum regnumof%7
+                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+                        %xdefine __src1 %7
+                        %xdefine __src2 %6
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7
+        %endif
+    %else
+        __instr %6
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+; Instructions with both VEX/EVEX and legacy encodings
+; Non-destructive instructions are written without parameters
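+; e.g. "addps m0, m1, m2" assembles to vaddps when AVX is enabled, and is
+; otherwise emulated as "movaps m0, m1" followed by "addps m0, m2".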
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse, 1
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3, 1
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse, 1
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+    %macro %1 2-5 fnord, %1, %2
+        %ifdef cpuname
+            %if notcpuflag(%5)
+                %error use of ``%4'' %5 instruction in cpuname function: current_function
+            %endif
+        %endif
+        %ifidn %3, fnord
+            %4 %1, %2
+        %else
+            %4 %1, %2, %3
+        %endif
+    %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+    %assign i i+1
+%endrep
+%undef i
+%undef j
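+; e.g. q3120 expands to ((3<<6)|(1<<4)|(2<<2)|0) = 0xD8, so that
+; "pshufd m0, m1, q3120" selects source dwords 3, 1, 2, 0 for destination
+; lanes 3..0; the digits read like the result, highest lane first.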
+
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %elifnidn %1, %4
+            %6 %1, %2, %3
+            %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
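+; e.g. "pmacsww m0, m1, m2, m3" emits the single XOP instruction
+; "vpmacsww m0, m1, m2, m3" when available, and is otherwise emulated as
+; "pmullw m0, m1, m2" followed by "paddw m0, m3"; without XOP the destination
+; must therefore not alias the accumulator (fourth) operand.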
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+    %push fma4_instr
+    %xdefine %$prefix %1
+    %rep %0 - 1
+        %macro %$prefix%2 4-6 %$prefix, %2
+            %if notcpuflag(fma3) && notcpuflag(fma4)
+                %error use of ``%5%6'' fma instruction in cpuname function: current_function
+            %elif cpuflag(fma4)
+                v%5%6 %1, %2, %3, %4
+            %elifidn %1, %2
+                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+                %ifnum sizeof%3
+                    v%{5}213%6 %2, %3, %4
+                %else
+                    v%{5}132%6 %2, %4, %3
+                %endif
+            %elifidn %1, %3
+                v%{5}213%6 %3, %2, %4
+            %elifidn %1, %4
+                v%{5}231%6 %4, %2, %3
+            %else
+                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+    %pop
+%endmacro
+
+FMA4_INSTR fmadd,    pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub,    pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd,   pd, ps, sd, ss
+FMA4_INSTR fnmsub,   pd, ps, sd, ss
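+; e.g. "fmaddps m0, m0, m1, m2" emits "vfmaddps m0, m0, m1, m2" on FMA4 and,
+; since dst aliases src1, "vfmadd213ps m0, m1, m2" on FMA3; both compute
+; m0 = m0*m1 + m2.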
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%3
+            %if regnumof%3 >= 16 || sizeof%3 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
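+; e.g. "vpxor m0, m1, m16" requires EVEX (registers 16 and up, like 512-bit
+; operands, have no VEX encoding) and is emitted as "vpxord m0, m1, m16",
+; whereas "vpxor m0, m1, m2" keeps the shorter VEX encoding.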
diff --git a/src/fg_apply.h b/src/fg_apply.h
new file mode 100644 (file)
index 0000000..6b96a06
--- /dev/null
+++ b/src/fg_apply.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FG_APPLY_H
+#define DAV1D_SRC_FG_APPLY_H
+
+#include "dav1d/picture.h"
+
+#include "common/bitdepth.h"
+
+#include "src/film_grain.h"
+
+bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
+                                    Dav1dPicture *const out,
+                                    Dav1dPicture *const in);
+
+#endif /* DAV1D_SRC_FG_APPLY_H */
diff --git a/src/fg_apply_tmpl.c b/src/fg_apply_tmpl.c
new file mode 100644 (file)
index 0000000..4cde92c
--- /dev/null
+++ b/src/fg_apply_tmpl.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "dav1d/picture.h"
+
+#include "common.h"
+#include "common/intops.h"
+#include "common/bitdepth.h"
+
+#include "fg_apply.h"
+
+static void generate_scaling(const int bitdepth,
+                             const uint8_t points[][2], const int num,
+                             uint8_t scaling[SCALING_SIZE])
+{
+#if BITDEPTH == 8
+    const int shift_x = 0;
+#else
+    const int shift_x = bitdepth - 8;
+#endif
+    const int scaling_size = 1 << bitdepth;
+
+    // Fill up the preceding entries with the initial value
+    for (int i = 0; i < points[0][0] << shift_x; i++)
+        scaling[i] = points[0][1];
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0];
+        const int by = points[i][1];
+        const int ex = points[i+1][0];
+        const int ey = points[i+1][1];
+        const int dx = ex - bx;
+        const int dy = ey - by;
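+        // dy/dx as a rounded 16.16 fixed-point slope; the interpolation below
+        // adds 0x8000 (i.e. 0.5) before shifting back down to an integer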
+        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0; x < dx; x++) {
+            const int v = by + ((x * delta + 0x8000) >> 16);
+            scaling[(bx + x) << shift_x] = v;
+        }
+    }
+
+    // Fill up the remaining entries with the final value
+    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
+        scaling[i] = points[num - 1][1];
+
+#if BITDEPTH != 8
+    const int pad = 1 << shift_x, rnd = pad >> 1;
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0] << shift_x;
+        const int ex = points[i+1][0] << shift_x;
+        const int dx = ex - bx;
+        for (int x = 0; x < dx; x += pad) {
+            const int range = scaling[bx + x + pad] - scaling[bx + x];
+            for (int n = 1; n < pad; n++) {
+                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
+            }
+        }
+    }
+#endif
+}
+
+#ifndef UNIT_TEST
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+                              Dav1dPicture *const out,
+                              Dav1dPicture *const in)
+{
+    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+
+    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    uint8_t scaling[3][SCALING_SIZE];
+#if BITDEPTH != 8
+    const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+    // Generate grain LUTs as needed
+    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
+    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
+                                                 data, 0 HIGHBD_TAIL_SUFFIX);
+    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
+                                                 data, 1 HIGHBD_TAIL_SUFFIX);
+
+    // Generate scaling LUTs as needed
+    if (data->num_y_points)
+        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
+    if (data->num_uv_points[0])
+        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
+    if (data->num_uv_points[1])
+        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
+
+    // Copy over the non-modified planes
+    // TODO: eliminate in favor of per-plane refs
+    assert(out->stride[0] == in->stride[0]);
+    if (!data->num_y_points) {
+        const ptrdiff_t stride = out->stride[0];
+        const ptrdiff_t sz = out->p.h * stride;
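+        // A negative stride means the plane is stored bottom-up, so data[0]
+        // points at the last row; rebasing by sz - stride (the lowest byte
+        // offset) turns the copy into a single forward memcpy of -sz bytes.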
+        if (sz < 0)
+            memcpy((uint8_t*) out->data[0] + sz - stride,
+                   (uint8_t*) in->data[0] + sz - stride, -sz);
+        else
+            memcpy(out->data[0], in->data[0], sz);
+    }
+
+    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
+        assert(out->stride[1] == in->stride[1]);
+        const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const ptrdiff_t stride = out->stride[1];
+        const ptrdiff_t sz = (out->p.h * stride) >> ss_ver;
+        if (sz < 0) {
+            if (!data->num_uv_points[0])
+                memcpy((uint8_t*) out->data[1] + sz - stride,
+                       (uint8_t*) in->data[1] + sz - stride, -sz);
+            if (!data->num_uv_points[1])
+                memcpy((uint8_t*) out->data[2] + sz - stride,
+                       (uint8_t*) in->data[2] + sz - stride, -sz);
+        } else {
+            if (!data->num_uv_points[0])
+                memcpy(out->data[1], in->data[1], sz);
+            if (!data->num_uv_points[1])
+                memcpy(out->data[2], in->data[2], sz);
+        }
+    }
+
+    // Synthesize grain for the affected planes
+    const int rows = (out->p.h + 31) >> 5;
+    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cpw = (out->p.w + ss_x) >> ss_x;
+    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
+    for (int row = 0; row < rows; row++) {
+        pixel *const luma_src =
+            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+
+        if (data->num_y_points) {
+            const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
+            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+                             luma_src, out->stride[0], data,
+                             out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+        }
+
+        if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+            !data->chroma_scaling_from_luma)
+        {
+            continue;
+        }
+
+        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+
+        // extend padding pixels
+        if (out->p.w & ss_x) {
+            pixel *ptr = luma_src;
+            for (int y = 0; y < bh; y++) {
+                ptr[out->p.w] = ptr[out->p.w - 1];
+                ptr += PXSTRIDE(in->stride[0]) << ss_y;
+            }
+        }
+
+        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+        if (data->chroma_scaling_from_luma) {
+            for (int pl = 0; pl < 2; pl++)
+                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
+                                                    in->stride[1], data, cpw,
+                                                    scaling[0], grain_lut[1 + pl],
+                                                    bh, row, luma_src, in->stride[0],
+                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
+        } else {
+            for (int pl = 0; pl < 2; pl++)
+                if (data->num_uv_points[pl])
+                    dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+                                                        ((const pixel *) in->data[1 + pl]) + uv_off,
+                                                        in->stride[1], data, cpw,
+                                                        scaling[1 + pl], grain_lut[1 + pl],
+                                                        bh, row, luma_src, in->stride[0],
+                                                        pl, is_id HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+#endif
diff --git a/src/film_grain.h b/src/film_grain.h
new file mode 100644 (file)
index 0000000..5bd4287
--- /dev/null
+++ b/src/film_grain.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FILM_GRAIN_H
+#define DAV1D_SRC_FILM_GRAIN_H
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+#define BLOCK_SIZE 32
+#if !defined(BITDEPTH) || BITDEPTH == 8
+#define SCALING_SIZE 256
+typedef int8_t entry;
+#else
+#define SCALING_SIZE 4096
+typedef int16_t entry;
+#endif
+
+#define decl_generate_grain_y_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+            const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
+
+#define decl_generate_grain_uv_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+            const entry buf_y[][GRAIN_WIDTH], \
+            const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
+
+#define decl_fgy_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+            const Dav1dFilmGrainData *data, \
+            size_t pw, const uint8_t scaling[SCALING_SIZE], \
+            const entry grain_lut[][GRAIN_WIDTH], \
+            int bh, int row_num HIGHBD_DECL_SUFFIX)
+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
+
+#define decl_fguv_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+            const Dav1dFilmGrainData *data, int pw, \
+            const uint8_t scaling[SCALING_SIZE], \
+            const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+            const pixel *luma_row, ptrdiff_t luma_stride, \
+            int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
+
+typedef struct Dav1dFilmGrainDSPContext {
+    generate_grain_y_fn generate_grain_y;
+    generate_grain_uv_fn generate_grain_uv[3];
+
+    fgy_32x32xn_fn fgy_32x32xn;
+    fguv_32x32xn_fn fguv_32x32xn[3];
+} Dav1dFilmGrainDSPContext;
+
+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
+bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
+
+#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/src/film_grain_tmpl.c b/src/film_grain_tmpl.c
new file mode 100644 (file)
index 0000000..90a03a4
--- /dev/null
+++ b/src/film_grain_tmpl.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "film_grain.h"
+#include "tables.h"
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
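+// 16-bit Fibonacci LFSR with feedback taps at bits 0, 1, 3 and 12 (the
+// pseudo-random number generator defined for AV1 film grain synthesis);
+// returns the top 'bits' bits of the updated state.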
+static inline int get_random_number(const int bits, unsigned *const state) {
+    const int r = *state;
+    unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+    *state = (r >> 1) | (bit << 15);
+
+    return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static inline int round2(const int x, const uint64_t shift) {
+    return (x + ((1 << shift) >> 1)) >> shift;
+}
+
+static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
+                               const Dav1dFilmGrainData *const data
+                               HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    unsigned seed = data->seed;
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    for (int y = 0; y < GRAIN_HEIGHT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_y;
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    if (!dx && !dy)
+                        break;
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = iclip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+static NOINLINE void
+generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
+                    const entry buf_y[][GRAIN_WIDTH],
+                    const Dav1dFilmGrainData *const data, const intptr_t uv,
+                    const int subx, const int suby HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    const int chromaW = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
+    const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    for (int y = 0; y < chromaH; y++) {
+        for (int x = 0; x < chromaW; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < chromaH; y++) {
+        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_uv[uv];
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // For the final (current) pixel, we need to add in the
+                    // contribution from the luma grain texture
+                    if (!dx && !dy) {
+                        if (!data->num_y_points)
+                            break;
+                        int luma = 0;
+                        const int lumaX = ((x - ar_pad) << subx) + ar_pad;
+                        const int lumaY = ((y - ar_pad) << suby) + ar_pad;
+                        for (int i = 0; i <= suby; i++) {
+                            for (int j = 0; j <= subx; j++) {
+                                luma += buf_y[lumaY + i][lumaX + j];
+                            }
+                        }
+                        luma = round2(luma, subx + suby);
+                        sum += luma * (*coeff);
+                        break;
+                    }
+
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = iclip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+#define gnuv_ss_fn(nm, ss_x, ss_y) \
+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
+    generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
+}
+
+gnuv_ss_fn(420, 1, 1);
+gnuv_ss_fn(422, 1, 0);
+gnuv_ss_fn(444, 0, 0);
+
+// Samples from the correct block of a grain LUT, taking into account the
+// block offsets provided by the offsets cache.
+static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
+                               const int offsets[2][2], const int subx, const int suby,
+                               const int bx, const int by, const int x, const int y)
+{
+    const int randval = offsets[bx][by];
+    const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
+    const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
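+    // randval packs the block's x offset into its high nibble and its y
+    // offset into the low nibble (see the get_random_number(8, ...) calls
+    // in the callers below)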
+    return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
+                    [offx + x + (BLOCK_SIZE >> subx) * bx];
+}
+
+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+                          const ptrdiff_t stride,
+                          const Dav1dFilmGrainData *const data, const size_t pw,
+                          const uint8_t scaling[SCALING_SIZE],
+                          const entry grain_lut[][GRAIN_WIDTH],
+                          const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = 235 << bitdepth_min_8;
+    } else {
+        min_value = 0;
+#if BITDEPTH == 8
+        max_value = 0xff;
+#else
+        max_value = bitdepth_max;
+#endif
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks
+    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+        const int bw = imin(BLOCK_SIZE, (int) pw - bx);
+
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? imin(2, bw) : 0;
+
+        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
+
+#define add_noise_y(x, y, grain)                                                  \
+        const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx;     \
+        pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx;           \
+        const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
+        *dst = iclip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                grain = round2(old * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+                top = round2(old * w[x][0] + top * w[x][1], 5);
+                top = iclip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply grain
+                grain = round2(top * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+    }
+}
+
+static NOINLINE void
+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+               const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+               const int pw, const uint8_t scaling[SCALING_SIZE],
+               const entry grain_lut[][GRAIN_WIDTH], const int bh,
+               const int row_num, const pixel *const luma_row,
+               const ptrdiff_t luma_stride, const int uv, const int is_id,
+               const int sx, const int sy HIGHBD_DECL_SUFFIX)
+{
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = (is_id ? 235 : 240) << bitdepth_min_8;
+    } else {
+        min_value = 0;
+#if BITDEPTH == 8
+        max_value = 0xff;
+#else
+        max_value = bitdepth_max;
+#endif
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks (subsampled)
+    for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+        const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? imin(2 >> sx, bw) : 0;
+
+        static const int w[2 /* sub */][2 /* off */][2] = {
+            { { 27, 17 }, { 17, 27 } },
+            { { 23, 22 } },
+        };
+
+#define add_noise_uv(x, y, grain)                                                    \
+            const int lx = (bx + x) << sx;                                           \
+            const int ly = y << sy;                                                  \
+            const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx;    \
+            pixel avg = luma[0];                                                     \
+            if (sx)                                                                  \
+                avg = (avg + luma[1] + 1) >> 1;                                      \
+            const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));  \
+            pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));        \
+            int val = avg;                                                           \
+            if (!data->chroma_scaling_from_luma) {                                   \
+                const int combined = avg * data->uv_luma_mult[uv] +                  \
+                               *src * data->uv_mult[uv];                             \
+                val = iclip_pixel( (combined >> 6) +                                 \
+                                   (data->uv_offset[uv] * (1 << bitdepth_min_8)) );  \
+            }                                                                        \
+            const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
+            *dst = iclip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+                top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5;
+                top = iclip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
+                grain = iclip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply to image
+                grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+    }
+}
+
+#define fguv_ss_fn(nm, ss_x, ss_y) \
+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
+    fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+                   row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
+                   HIGHBD_TAIL_SUFFIX); \
+}
+
+fguv_ss_fn(420, 1, 1);
+fguv_ss_fn(422, 1, 0);
+fguv_ss_fn(444, 0, 0);
+
+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
+    c->generate_grain_y = generate_grain_y_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
+
+    c->fgy_32x32xn = fgy_32x32xn_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_film_grain_dsp_init_x86)(c);
+#endif
+}
diff --git a/src/getbits.c b/src/getbits.c
new file mode 100644 (file)
index 0000000..c185053
--- /dev/null
+++ b/src/getbits.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/intops.h"
+
+#include "src/getbits.h"
+
+void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data,
+                         const size_t sz)
+{
+    c->ptr = c->ptr_start = data;
+    c->ptr_end = &c->ptr_start[sz];
+    c->bits_left = 0;
+    c->state = 0;
+    c->error = 0;
+    c->eof = 0;
+}
+
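+// The bit buffer is kept MSB-aligned: whole bytes are accumulated at the low
+// end of a temporary and then shifted up so that the next unread bit always
+// occupies bit 63 of c->state.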
+static void refill(GetBits *const c, const unsigned n) {
+    assert(c->bits_left <= 56);
+    uint64_t state = 0;
+    do {
+        state <<= 8;
+        c->bits_left += 8;
+        if (!c->eof)
+            state |= *c->ptr++;
+        if (c->ptr >= c->ptr_end) {
+            c->error = c->eof;
+            c->eof = 1;
+        }
+    } while (n > c->bits_left);
+    c->state |= state << (64 - c->bits_left);
+}
+
+unsigned dav1d_get_bits(GetBits *const c, const unsigned n) {
+    assert(n <= 32 /* can go up to 57 if we change return type */);
+    assert(n /* can't shift state by 64 */);
+
+    if (n > c->bits_left) refill(c, n);
+
+    const uint64_t state = c->state;
+    c->bits_left -= n;
+    c->state <<= n;
+
+    return (unsigned) (state >> (64 - n));
+}
+
+int dav1d_get_sbits(GetBits *const c, const unsigned n) {
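+    // read an (n+1)-bit two's-complement value; the shift pair sign-extends
+    // it to a full int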
+    const int shift = 31 - n;
+    const int res = dav1d_get_bits(c, n + 1) << shift;
+    return res >> shift;
+}
+
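+// ULEB128: 7 value bits per 8-bit group, least-significant group first, with
+// the leading bit of each group flagging continuation. For example, the bytes
+// 0xE5 0x8E 0x26 decode to 0x65 | (0x0E << 7) | (0x26 << 14) = 624485.
+// Values that overflow 32 bits, or encodings longer than 8 groups, set the
+// error flag and return 0.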
+unsigned dav1d_get_uleb128(GetBits *c) {
+    unsigned val = 0, more, i = 0;
+
+    do {
+        more = dav1d_get_bits(c, 1);
+        unsigned bits = dav1d_get_bits(c, 7);
+        if (i <= 3 || (i == 4 && bits < (1 << 4)))
+            val |= bits << (i * 7);
+        else if (bits) {
+            c->error = 1;
+            return 0;
+        }
+        if (more && ++i == 8) {
+            c->error = 1;
+            return 0;
+        }
+    } while (more);
+
+    return val;
+}
+
+unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
+    // Output in range [0..max-1]
+    // max must be > 1, or else nothing is read from the bitstream
+    assert(max > 1);
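+    // e.g. max == 5: l = 3, m = 8 - 5 = 3; v in 0..2 is returned after two
+    // bits, while v == 3 consumes one extra bit and yields 3 or 4, so smaller
+    // values get the shorter codes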
+    const int l = ulog2(max) + 1;
+    assert(l > 1);
+    const unsigned m = (1U << l) - max;
+    const unsigned v = dav1d_get_bits(c, l - 1);
+    return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1);
+}
+
+unsigned dav1d_get_vlc(GetBits *const c) {
+    int n_bits = 0;
+    while (!dav1d_get_bits(c, 1))
+        if (++n_bits == 32)
+            return 0xFFFFFFFFU;
+    return n_bits ? ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0;
+}
+
+static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
+                                  const unsigned n)
+{
+    unsigned v = 0;
+
+    for (int i = 0;; i++) {
+        const int b = i ? 3 + i - 1 : 3;
+
+        if (n < v + 3 * (1 << b)) {
+            v += dav1d_get_uniform(c, n - v + 1);
+            break;
+        }
+
+        if (!dav1d_get_bits(c, 1)) {
+            v += dav1d_get_bits(c, b);
+            break;
+        }
+
+        v += 1 << b;
+    }
+
+    return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v);
+}
+
+int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) {
+    return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n);
+}
+
+void dav1d_bytealign_get_bits(GetBits *c) {
+    // bits_left is never more than 7, because it is only ever incremented
+    // by refill(), which dav1d_get_bits() calls on demand and which never
+    // reads more than 7 bits beyond what was requested.
+    //
+    // If this wasn't true, we would need to work out how many bits to
+    // discard (bits_left % 8), subtract that from bits_left and then
+    // shift state right by that amount.
+    assert(c->bits_left <= 7);
+
+    c->bits_left = 0;
+    c->state = 0;
+}
diff --git a/src/getbits.h b/src/getbits.h
new file mode 100644 (file)
index 0000000..fc38214
--- /dev/null
+++ b/src/getbits.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_GETBITS_H
+#define DAV1D_SRC_GETBITS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct GetBits {
+    int error, eof;
+    uint64_t state;
+    unsigned bits_left;
+    const uint8_t *ptr, *ptr_start, *ptr_end;
+} GetBits;
+
+void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz);
+unsigned dav1d_get_bits(GetBits *c, unsigned n);
+int dav1d_get_sbits(GetBits *c, unsigned n);
+unsigned dav1d_get_uleb128(GetBits *c);
+
+// Output in range 0..max-1
+unsigned dav1d_get_uniform(GetBits *c, unsigned max);
+unsigned dav1d_get_vlc(GetBits *c);
+int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n);
+
+// Discard bits from the buffer until we're next byte-aligned.
+void dav1d_bytealign_get_bits(GetBits *c);
+
+// Return the current bit position relative to the start of the buffer.
+static inline unsigned dav1d_get_bits_pos(const GetBits *c) {
+    return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left;
+}
+
+#endif /* DAV1D_SRC_GETBITS_H */
diff --git a/src/internal.h b/src/internal.h
new file mode 100644 (file)
index 0000000..07f5676
--- /dev/null
+++ b/src/internal.h
@@ -0,0 +1,349 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTERNAL_H
+#define DAV1D_SRC_INTERNAL_H
+
+#include <stdatomic.h>
+
+#include "dav1d/data.h"
+
+typedef struct Dav1dFrameContext Dav1dFrameContext;
+typedef struct Dav1dTileState Dav1dTileState;
+typedef struct Dav1dTileContext Dav1dTileContext;
+
+#include "common/attributes.h"
+
+#include "src/cdef.h"
+#include "src/cdf.h"
+#include "src/data.h"
+#include "src/env.h"
+#include "src/film_grain.h"
+#include "src/intra_edge.h"
+#include "src/ipred.h"
+#include "src/itx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/loopfilter.h"
+#include "src/looprestoration.h"
+#include "src/mc.h"
+#include "src/msac.h"
+#include "src/picture.h"
+#include "src/recon.h"
+#include "src/refmvs.h"
+#include "src/thread.h"
+
+typedef struct Dav1dDSPContext {
+    Dav1dFilmGrainDSPContext fg;
+    Dav1dIntraPredDSPContext ipred;
+    Dav1dMCDSPContext mc;
+    Dav1dInvTxfmDSPContext itx;
+    Dav1dLoopFilterDSPContext lf;
+    Dav1dCdefDSPContext cdef;
+    Dav1dLoopRestorationDSPContext lr;
+} Dav1dDSPContext;
+
+struct Dav1dTileGroup {
+    Dav1dData data;
+    int start, end;
+};
+
+struct Dav1dContext {
+    Dav1dFrameContext *fc;
+    unsigned n_fc;
+
+    // cache of OBUs that make up a single frame before we submit them
+    // to a frame worker to be decoded
+    struct Dav1dTileGroup *tile;
+    int n_tile_data_alloc;
+    int n_tile_data;
+    int n_tiles;
+    Dav1dRef *seq_hdr_ref;
+    Dav1dSequenceHeader *seq_hdr;
+    Dav1dRef *frame_hdr_ref;
+    Dav1dFrameHeader *frame_hdr;
+
+    Dav1dRef *content_light_ref;
+    Dav1dContentLightLevel *content_light;
+    Dav1dRef *mastering_display_ref;
+    Dav1dMasteringDisplay *mastering_display;
+    Dav1dRef *itut_t35_ref;
+    Dav1dITUTT35 *itut_t35;
+
+    // decoded output picture queue
+    Dav1dData in;
+    Dav1dPicture out;
+    struct {
+        Dav1dThreadPicture *out_delayed;
+        unsigned next;
+        // flush is a pointer (to flush_mem) to prevent compiler errors about
+        // atomic_load() not taking const arguments; the const attribute is
+        // not propagated through pointers
+        atomic_int flush_mem, *flush;
+    } frame_thread;
+
+    // reference/entropy state
+    struct {
+        Dav1dThreadPicture p;
+        Dav1dRef *segmap;
+        Dav1dRef *refmvs;
+        unsigned refpoc[7];
+    } refs[8];
+    CdfThreadContext cdf[8];
+
+    Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
+
+    // tree to keep track of which edges are available
+    struct {
+        EdgeNode *root[2 /* BL_128X128 vs. BL_64X64 */];
+        EdgeBranch branch_sb128[1 + 4 + 16 + 64];
+        EdgeBranch branch_sb64[1 + 4 + 16];
+        EdgeTip tip_sb128[256];
+        EdgeTip tip_sb64[64];
+    } intra_edge;
+
+    Dav1dPicAllocator allocator;
+    int apply_grain;
+    int operating_point;
+    unsigned operating_point_idc;
+    int all_layers;
+    unsigned frame_size_limit;
+    int drain;
+
+    Dav1dLogger logger;
+};
+
+struct Dav1dFrameContext {
+    Dav1dRef *seq_hdr_ref;
+    Dav1dSequenceHeader *seq_hdr;
+    Dav1dRef *frame_hdr_ref;
+    Dav1dFrameHeader *frame_hdr;
+    Dav1dThreadPicture refp[7];
+    Dav1dPicture cur; // during block coding / reconstruction
+    Dav1dThreadPicture sr_cur; // after super-resolution upscaling
+    Dav1dRef *mvs_ref;
+    refmvs_temporal_block *mvs, *ref_mvs[7];
+    Dav1dRef *ref_mvs_ref[7];
+    Dav1dRef *cur_segmap_ref, *prev_segmap_ref;
+    uint8_t *cur_segmap;
+    const uint8_t *prev_segmap;
+    unsigned refpoc[7], refrefpoc[7][7];
+    uint8_t gmv_warp_allowed[7];
+    CdfThreadContext in_cdf, out_cdf;
+    struct Dav1dTileGroup *tile;
+    int n_tile_data_alloc;
+    int n_tile_data;
+
+    // for scalable references
+    struct ScalableMotionParams {
+        int scale; // if no scaling, this is 0
+        int step;
+    } svc[7][2 /* x, y */];
+    int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
+
+    const Dav1dContext *c;
+    Dav1dTileContext *tc;
+    int n_tc;
+    Dav1dTileState *ts;
+    int n_ts;
+    const Dav1dDSPContext *dsp;
+    struct {
+        recon_b_intra_fn recon_b_intra;
+        recon_b_inter_fn recon_b_inter;
+        filter_sbrow_fn filter_sbrow;
+        backup_ipred_edge_fn backup_ipred_edge;
+        read_coef_blocks_fn read_coef_blocks;
+    } bd_fn;
+
+    int ipred_edge_sz;
+    pixel *ipred_edge[3];
+    ptrdiff_t b4_stride;
+    int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
+    uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+    const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
+    BlockContext *a;
+    int a_sz /* w*tile_rows */;
+    refmvs_frame rf;
+    uint8_t jnt_weights[7][7];
+    int bitdepth_max;
+
+    struct {
+        struct thread_data td;
+        int pass, die;
+        // indexed using t->by * f->b4_stride + t->bx
+        Av1Block *b;
+        struct CodedBlockInfo {
+            int16_t eob[3 /* plane */];
+            uint8_t txtp[3 /* plane */];
+        } *cbi;
+        // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
+        uint16_t (*pal)[3 /* plane */][8 /* idx */];
+        // iterated over inside tile state
+        uint8_t *pal_idx;
+        coef *cf;
+        int pal_sz, pal_idx_sz, cf_sz;
+        // start offsets per tile
+        int *tile_start_off;
+    } frame_thread;
+
+    // loopfilter
+    struct {
+        uint8_t (*level)[4];
+        Av1Filter *mask;
+        Av1Restoration *lr_mask;
+        int top_pre_cdef_toggle;
+        int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
+        int lr_line_sz, re_sz /* h */;
+        ALIGN(Av1FilterLUT lim_lut, 16);
+        int last_sharpness;
+        uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+        uint8_t *tx_lpf_right_edge[2];
+        uint8_t *cdef_line_buf;
+        pixel *cdef_line[2 /* pre, post */][3 /* plane */];
+        pixel *lr_lpf_line[3 /* plane */];
+
+        // per-frame in-loop filter state
+        int tile_row; // for carry-over at tile row edges
+        pixel *p[3], *sr_p[3];
+        Av1Filter *mask_ptr, *prev_mask_ptr;
+        int restore_planes; // enum LrRestorePlanes
+    } lf;
+
+    // threading (refer to tc[] for per-thread things)
+    struct FrameTileThreadData {
+        uint64_t available;
+        pthread_mutex_t lock;
+        pthread_cond_t cond, icond;
+        int tasks_left, num_tasks;
+        int (*task_idx_to_sby_and_tile_idx)[2];
+        int titsati_sz, titsati_init[2];
+        uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
+        int inited;
+    } tile_thread;
+};
+
+struct Dav1dTileState {
+    CdfContext cdf;
+    MsacContext msac;
+
+    struct {
+        int col_start, col_end, row_start, row_end; // in 4px units
+        int col, row; // in tile units
+    } tiling;
+
+    atomic_int progress; // in sby units, TILE_ERROR after a decoding error
+    struct {
+        pthread_mutex_t lock;
+        pthread_cond_t cond;
+    } tile_thread;
+    struct {
+        uint8_t *pal_idx;
+        coef *cf;
+    } frame_thread;
+
+    uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+    const uint16_t (*dq)[3][2];
+    int last_qidx;
+
+    int8_t last_delta_lf[4];
+    uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+    const uint8_t (*lflvl)[4][8][2];
+
+    Av1RestorationUnit *lr_ref[3];
+};
+
+struct Dav1dTileContext {
+    const Dav1dFrameContext *f;
+    Dav1dTileState *ts;
+    int bx, by;
+    BlockContext l, *a;
+    ALIGN(union, 32) {
+        int16_t cf_8bpc [32 * 32];
+        int32_t cf_16bpc[32 * 32];
+    };
+    // FIXME: these types could be changed to pixel (and dynamically
+    // allocated), which might make copy/assign operations slightly faster
+    uint16_t al_pal[2 /* a/l */][32 /* bx4/by4 */][3 /* plane */][8 /* palette_idx */];
+    uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
+    uint8_t txtp_map[32 * 32]; // inter-only
+    refmvs_tile rt;
+    ALIGN(union, 64) {
+        struct {
+            union {
+                uint8_t  lap_8bpc [128 * 32];
+                uint16_t lap_16bpc[128 * 32];
+                struct {
+                    int16_t compinter[2][128 * 128];
+                    uint8_t seg_mask[128 * 128];
+                };
+            };
+            union {
+                // stride=192 for non-SVC, or 320 for SVC
+                uint8_t  emu_edge_8bpc [320 * (256 + 7)];
+                uint16_t emu_edge_16bpc[320 * (256 + 7)];
+            };
+        };
+        struct {
+            union {
+                uint8_t levels[32 * 34];
+                struct {
+                    uint8_t pal_order[64][8];
+                    uint8_t pal_ctx[64];
+                };
+            };
+            int16_t ac[32 * 32];
+            uint8_t pal_idx[2 * 64 * 64];
+            uint16_t pal[3 /* plane */][8 /* palette_idx */];
+            ALIGN(union, 32) {
+                struct {
+                    uint8_t interintra_8bpc[64 * 64];
+                    uint8_t edge_8bpc[257];
+                };
+                struct {
+                    uint16_t interintra_16bpc[64 * 64];
+                    uint16_t edge_16bpc[257];
+                };
+            };
+        };
+    } scratch;
+
+    Dav1dWarpedMotionParams warpmv;
+    Av1Filter *lf_mask;
+    int8_t *cur_sb_cdef_idx_ptr;
+    // for chroma sub8x8, we need to know the filter for all 4 subblocks in
+    // a 4x4 area, but the top/left one may already have gone out of cache,
+    // so this keeps it accessible
+    enum Filter2d tl_4x4_filter;
+
+    struct {
+        struct thread_data td;
+        struct FrameTileThreadData *fttd;
+        int die;
+    } tile_thread;
+};
+
+#endif /* DAV1D_SRC_INTERNAL_H */
diff --git a/src/intra_edge.c b/src/intra_edge.c
new file mode 100644 (file)
index 0000000..684d113
--- /dev/null
+++ b/src/intra_edge.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+struct ModeSelMem {
+    EdgeBranch *nwc[3 /* 64x64, 32x32, 16x16 */];
+    EdgeTip *nt;
+};
+
+static void init_edges(EdgeNode *const node,
+                       const enum BlockLevel bl,
+                       const enum EdgeFlags edge_flags)
+{
+    node->o = edge_flags;
+
+#define ALL_FL(t) (EDGE_I444_##t | EDGE_I422_##t | EDGE_I420_##t)
+    if (bl == BL_8X8) {
+        EdgeTip *const nt = (EdgeTip *) node;
+
+        node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        node->h[1] = edge_flags & (ALL_FL(LEFT_HAS_BOTTOM) |
+                                   EDGE_I420_TOP_HAS_RIGHT);
+
+        node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        node->v[1] = edge_flags & (ALL_FL(TOP_HAS_RIGHT) |
+                                   EDGE_I420_LEFT_HAS_BOTTOM |
+                                   EDGE_I422_LEFT_HAS_BOTTOM);
+
+        nt->split[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nt->split[1] = (edge_flags & ALL_FL(TOP_HAS_RIGHT)) |
+                       EDGE_I422_LEFT_HAS_BOTTOM;
+        nt->split[2] = edge_flags | EDGE_I444_TOP_HAS_RIGHT;
+        nt->split[3] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT |
+                                     EDGE_I420_LEFT_HAS_BOTTOM |
+                                     EDGE_I422_LEFT_HAS_BOTTOM);
+    } else {
+        EdgeBranch *const nwc = (EdgeBranch *) node;
+
+        node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        node->h[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+        node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        node->v[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+        nwc->h4[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->h4[1] =
+        nwc->h4[2] = ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->h4[3] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+        if (bl == BL_16X16)
+            nwc->h4[1] |= edge_flags & EDGE_I420_TOP_HAS_RIGHT;
+
+        nwc->v4[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->v4[1] =
+        nwc->v4[2] = ALL_FL(TOP_HAS_RIGHT);
+        nwc->v4[3] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+        if (bl == BL_16X16)
+            nwc->v4[1] |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM |
+                                        EDGE_I422_LEFT_HAS_BOTTOM);
+
+        nwc->tls[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tls[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tls[2] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+        nwc->trs[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->trs[1] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->trs[2] = 0;
+
+        nwc->tts[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tts[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+        nwc->tts[2] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+        nwc->tbs[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+        nwc->tbs[1] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+        nwc->tbs[2] = 0;
+    }
+}
+
+static void init_mode_node(EdgeBranch *const nwc,
+                           const enum BlockLevel bl,
+                           struct ModeSelMem *const mem,
+                           const int top_has_right,
+                           const int left_has_bottom)
+{
+    init_edges(&nwc->node, bl,
+               (top_has_right ? ALL_FL(TOP_HAS_RIGHT) : 0) |
+               (left_has_bottom ? ALL_FL(LEFT_HAS_BOTTOM) : 0));
+    if (bl == BL_16X16) {
+        for (int n = 0; n < 4; n++) {
+            EdgeTip *const nt = mem->nt++;
+            nwc->split[n] = &nt->node;
+            init_edges(&nt->node, bl + 1,
+                       ((n == 3 || (n == 1 && !top_has_right)) ? 0 :
+                        ALL_FL(TOP_HAS_RIGHT)) |
+                       (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 :
+                        ALL_FL(LEFT_HAS_BOTTOM)));
+        }
+    } else {
+        for (int n = 0; n < 4; n++) {
+            EdgeBranch *const nwc_child = mem->nwc[bl]++;
+            nwc->split[n] = &nwc_child->node;
+            init_mode_node(nwc_child, bl + 1, mem,
+                           !(n == 3 || (n == 1 && !top_has_right)),
+                           n == 0 || (n == 2 && left_has_bottom));
+        }
+    }
+}
+
+void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt,
+                          const int allow_sb128)
+{
+    EdgeBranch *const root = (EdgeBranch *) root_node;
+    struct ModeSelMem mem;
+    mem.nt = nt;
+
+    if (allow_sb128) {
+        mem.nwc[BL_128X128] = &root[1];
+        mem.nwc[BL_64X64] = &root[1 + 4];
+        mem.nwc[BL_32X32] = &root[1 + 4 + 16];
+        init_mode_node(root, BL_128X128, &mem, 1, 0);
+        assert(mem.nwc[BL_128X128] == &root[1 + 4]);
+        assert(mem.nwc[BL_64X64] == &root[1 + 4 + 16]);
+        assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16 + 64]);
+        assert(mem.nt == &nt[256]);
+    } else {
+        mem.nwc[BL_128X128] = NULL;
+        mem.nwc[BL_64X64] = &root[1];
+        mem.nwc[BL_32X32] = &root[1 + 4];
+        init_mode_node(root, BL_64X64, &mem, 1, 0);
+        assert(mem.nwc[BL_64X64] == &root[1 + 4]);
+        assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16]);
+        assert(mem.nt == &nt[64]);
+    }
+}
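+
+/* A hedged usage sketch (mirroring the asserts above and the layout of the
+ * Dav1dContext.intra_edge arrays in src/internal.h): the decoder keeps one
+ * tree per superblock size and initializes both once, roughly:
+ *
+ *     c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;
+ *     dav1d_init_mode_tree(c->intra_edge.root[BL_128X128],
+ *                          c->intra_edge.tip_sb128, 1);
+ *     c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node;
+ *     dav1d_init_mode_tree(c->intra_edge.root[BL_64X64],
+ *                          c->intra_edge.tip_sb64, 0);
+ */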
diff --git a/src/intra_edge.h b/src/intra_edge.h
new file mode 100644 (file)
index 0000000..8b4e150
--- /dev/null
+++ b/src/intra_edge.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTRA_EDGE_H
+#define DAV1D_SRC_INTRA_EDGE_H
+
+enum EdgeFlags {
+    EDGE_I444_TOP_HAS_RIGHT = 1 << 0,
+    EDGE_I422_TOP_HAS_RIGHT = 1 << 1,
+    EDGE_I420_TOP_HAS_RIGHT = 1 << 2,
+    EDGE_I444_LEFT_HAS_BOTTOM = 1 << 3,
+    EDGE_I422_LEFT_HAS_BOTTOM = 1 << 4,
+    EDGE_I420_LEFT_HAS_BOTTOM = 1 << 5,
+};
+
+typedef struct EdgeNode EdgeNode;
+struct EdgeNode {
+    enum EdgeFlags o, h[2], v[2];
+};
+typedef struct EdgeTip {
+    EdgeNode node;
+    enum EdgeFlags split[4];
+} EdgeTip;
+typedef struct EdgeBranch {
+    EdgeNode node;
+    enum EdgeFlags tts[3], tbs[3], tls[3], trs[3], h4[4], v4[4];
+    EdgeNode *split[4];
+} EdgeBranch;
+
+void dav1d_init_mode_tree(EdgeNode *const root, EdgeTip *const nt,
+                          const int allow_sb128);
+
+#endif /* DAV1D_SRC_INTRA_EDGE_H */
diff --git a/src/ipred.h b/src/ipred.h
new file mode 100644 (file)
index 0000000..5df2657
--- /dev/null
+++ b/src/ipred.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_H
+#define DAV1D_SRC_IPRED_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+/*
+ * Intra prediction.
+ * - angle is the angle (in degrees) for directional intra predictors; for
+ *   other modes, it is ignored;
+ * - topleft is the same as the argument given to dav1d_prepare_intra_edges();
+ *   see ipred_prepare.h for more detailed documentation.
+ */
+#define decl_angular_ipred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+            int width, int height, int angle, int max_width, int max_height \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_angular_ipred_fn(*angular_ipred_fn);
+
+/*
+ * Create a subsampled Y plane with the DC subtracted.
+ * - w_pad/h_pad are the parts of the width/height that extend outside the
+ *   visible portion of the frame, in 4px units;
+ * - ac has a stride of 16.
+ */
+#define decl_cfl_ac_fn(name) \
+void (name)(int16_t *ac, const pixel *y, ptrdiff_t stride, \
+            int w_pad, int h_pad, int cw, int ch)
+typedef decl_cfl_ac_fn(*cfl_ac_fn);
+
+/*
+ * dst[x,y] += alpha * ac[x,y]
+ * - alpha contains a q3 scalar in [-16,16] range;
+ */
+#define decl_cfl_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+            int width, int height, const int16_t *ac, int alpha \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_cfl_pred_fn(*cfl_pred_fn);
+
+/*
+ * dst[x,y] = pal[idx[x,y]]
+ * - palette indices are [0-7]
+ */
+#define decl_pal_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
+            const uint8_t *idx, int w, int h)
+typedef decl_pal_pred_fn(*pal_pred_fn);
+
+typedef struct Dav1dIntraPredDSPContext {
+    angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
+
+    // chroma-from-luma
+    cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */];
+    cfl_pred_fn cfl_pred[DC_128_PRED + 1];
+
+    // palette
+    pal_pred_fn pal_pred;
+} Dav1dIntraPredDSPContext;
+
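+/* A hedged usage sketch: reconstruction looks up the mode index returned by
+ * dav1d_prepare_intra_edges() in this table, with the ANGLE_* flags from
+ * ipred_prepare.h OR'd into the angle argument. Assuming the Dav1dDSPContext
+ * member is named ipred, and with illustrative names (bw4/bh4 being the
+ * block size in 4px units), a call is roughly:
+ *
+ *     dsp->ipred.intra_pred[mode](dst, stride, topleft, bw4 * 4, bh4 * 4,
+ *                                 angle | sm_flag(t->a, bx4),
+ *                                 max_w, max_h HIGHBD_TAIL_SUFFIX);
+ */
+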
+bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
+
+#endif /* DAV1D_SRC_IPRED_H */
diff --git a/src/ipred_prepare.h b/src/ipred_prepare.h
new file mode 100644 (file)
index 0000000..6a7efeb
--- /dev/null
+++ b/src/ipred_prepare.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_PREPARE_H
+#define DAV1D_SRC_IPRED_PREPARE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/env.h"
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+/*
+ * Luma intra edge preparation.
+ *
+ * x/y/start/w/h are in luma block (4px) units:
+ * - x and y are the absolute block positions in the image;
+ * - start/w/h are the *dependent tile* boundary positions. In practice, start
+ *   is the horizontal tile start, w is the horizontal tile end, the vertical
+ *   tile start is assumed to be 0, and h is the vertical image end.
+ *
+ * edge_flags signals which edges are available for this transform-block inside
+ * the given partition, as well as for the partition inside the superblock
+ * structure.
+ *
+ * dst and stride are pointers to the top/left position of the current block,
+ * and can be used to locate the top, left, top/left, top/right and bottom/left
+ * edge pointers also.
+ *
+ * angle is the angle_delta [-3..3] on input, and the absolute angle on output.
+ *
+ * mode is the intra prediction mode as coded in the bitstream. The return value
+ * is this same mode, converted to an index in the DSP functions.
+ *
+ * tw/th are the size of the transform block in block (4px) units.
+ *
+ * topleft_out is a pointer to scratch memory that will be filled with the edge
+ * pixels. The memory array should have space to be indexed in the [-2*w,2*w]
+ * range, in the following order:
+ * - [0] will be the top/left edge pixel;
+ * - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
+ * - [w+1..2*w] will be the top/right edge pixels;
+ * - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
+ *   most);
+ * - [-w-1..-2*w] will be the bottom/left edge pixels.
+ * Each edge may remain uninitialized if it is not used by the returned mode
+ * index. If edges are not available (because the edge position is outside the
+ * tile dimensions or because edge_flags indicates lack of edge availability),
+ * they will be extended from nearby edges as defined by the AV1 spec.
+ */
+enum IntraPredMode
+    bytefn(dav1d_prepare_intra_edges)(int x, int have_left, int y, int have_top,
+                                      int w, int h, enum EdgeFlags edge_flags,
+                                      const pixel *dst, ptrdiff_t stride,
+                                      const pixel *prefilter_toplevel_sb_edge,
+                                      enum IntraPredMode mode, int *angle,
+                                      int tw, int th, int filter_edge,
+                                      pixel *topleft_out HIGHBD_DECL_SUFFIX);
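+
+/* Worked example of the topleft_out layout documented above (assuming the w
+ * in the index ranges denotes the block width in pixels): an 8x8 transform
+ * block (tw = th = 2 in 4px units, so w = 8) needs topleft_out indexable in
+ * [-16..16]: [0] is the corner pixel, [1..8] the top edge, [9..16] the
+ * top/right edge, [-1..-8] the left edge and [-9..-16] the bottom/left
+ * edge. */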
+
+// These flags are OR'd into the angle argument passed to intra predictors.
+// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
+// with a filter before using them to predict values in a block.
+// ANGLE_SMOOTH_EDGE_FLAG means that edges are smooth and should use
+// reduced filter strength.
+#define ANGLE_USE_EDGE_FILTER_FLAG 1024
+#define ANGLE_SMOOTH_EDGE_FLAG      512
+
+static inline int sm_flag(const BlockContext *const b, const int idx) {
+    if (!b->intra[idx]) return 0;
+    const enum IntraPredMode m = b->mode[idx];
+    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
+    const enum IntraPredMode m = b->uvmode[idx];
+    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+#endif /* DAV1D_SRC_IPRED_PREPARE_H */
diff --git a/src/ipred_prepare_tmpl.c b/src/ipred_prepare_tmpl.c
new file mode 100644 (file)
index 0000000..0bf9de9
--- /dev/null
+++ b/src/ipred_prepare_tmpl.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+                                  [2 /* have_left */][2 /* have_top */] =
+{
+    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
+                     { LEFT_DC_PRED, DC_PRED     } },
+    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
+                     { HOR_PRED,     PAETH_PRED  } },
+};
+
+static const uint8_t av1_mode_to_angle_map[8] = {
+    90, 180, 45, 135, 113, 157, 203, 67
+};
+
+static const struct {
+    uint8_t needs_left:1;
+    uint8_t needs_top:1;
+    uint8_t needs_topleft:1;
+    uint8_t needs_topright:1;
+    uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
+    [VERT_PRED]     = { .needs_top  = 1 },
+    [HOR_PRED]      = { .needs_left = 1 },
+    [LEFT_DC_PRED]  = { .needs_left = 1 },
+    [TOP_DC_PRED]   = { .needs_top  = 1 },
+    [DC_128_PRED]   = { 0 },
+    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
+                        .needs_topleft = 1 },
+    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
+                        .needs_topleft = 1 },
+    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+                                  const int y, const int have_top,
+                                  const int w, const int h,
+                                  const enum EdgeFlags edge_flags,
+                                  const pixel *const dst,
+                                  const ptrdiff_t stride,
+                                  const pixel *prefilter_toplevel_sb_edge,
+                                  enum IntraPredMode mode, int *const angle,
+                                  const int tw, const int th, const int filter_edge,
+                                  pixel *const topleft_out HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth = bitdepth_from_max(bitdepth_max);
+    assert(y < h && x < w);
+
+    switch (mode) {
+    case VERT_PRED:
+    case HOR_PRED:
+    case DIAG_DOWN_LEFT_PRED:
+    case DIAG_DOWN_RIGHT_PRED:
+    case VERT_RIGHT_PRED:
+    case HOR_DOWN_PRED:
+    case HOR_UP_PRED:
+    case VERT_LEFT_PRED: {
+        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
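+        /* e.g. VERT_LEFT_PRED has a base angle of 67 degrees, so an
+         * angle_delta of +2 yields 67 + 3 * 2 = 73, which resolves to
+         * Z1_PRED below when the top edge is available */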
+
+        if (*angle <= 90)
+            mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
+        else if (*angle < 180)
+            mode = Z2_PRED;
+        else
+            mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
+        break;
+    }
+    case DC_PRED:
+    case PAETH_PRED:
+        mode = av1_mode_conv[mode][have_left][have_top];
+        break;
+    default:
+        break;
+    }
+
+    const pixel *dst_top;
+    if (have_top &&
+        (av1_intra_prediction_edges[mode].needs_top ||
+         av1_intra_prediction_edges[mode].needs_topleft ||
+         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+    {
+        if (prefilter_toplevel_sb_edge) {
+            dst_top = &prefilter_toplevel_sb_edge[x * 4];
+        } else {
+            dst_top = &dst[-PXSTRIDE(stride)];
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_left) {
+        const int sz = th << 2;
+        pixel *const left = &topleft_out[-sz];
+
+        if (have_left) {
+            const int px_have = imin(sz, (h - y) << 2);
+
+            for (int i = 0; i < px_have; i++)
+                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+            if (px_have < sz)
+                pixel_set(left, left[sz - px_have], sz - px_have);
+        } else {
+            pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+            if (have_bottomleft) {
+                const int px_have = imin(sz, (h - y - th) << 2);
+
+                for (int i = 0; i < px_have; i++)
+                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+                if (px_have < sz)
+                    pixel_set(left - sz, left[-px_have], sz - px_have);
+            } else {
+                pixel_set(left - sz, left[0], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_top) {
+        const int sz = tw << 2;
+        pixel *const top = &topleft_out[1];
+
+        if (have_top) {
+            const int px_have = imin(sz, (w - x) << 2);
+            pixel_copy(top, dst_top, px_have);
+            if (px_have < sz)
+                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+        } else {
+            pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_topright) {
+            const int have_topright = (!have_top || x + tw >= w) ? 0 :
+                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+            if (have_topright) {
+                const int px_have = imin(sz, (w - x - tw) << 2);
+
+                pixel_copy(top + sz, &dst_top[sz], px_have);
+                if (px_have < sz)
+                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
+                              sz - px_have);
+            } else {
+                pixel_set(top + sz, top[sz - 1], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_topleft) {
+        if (have_left)
+            *topleft_out = have_top ? dst_top[-1] : dst[-1];
+        else
+            *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
+
+        if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
+            *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
+                            topleft_out[0] * 6 + 8) >> 4;
+    }
+
+    return mode;
+}
diff --git a/src/ipred_tmpl.c b/src/ipred_tmpl.c
new file mode 100644 (file)
index 0000000..50c7a3c
--- /dev/null
+++ b/src/ipred_tmpl.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 8
+    assert(dc <= 0xff);
+    if (width > 4) {
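+        /* replicate dc across a 64-bit word (e.g. dc = 0x80 gives dcN =
+         * 0x8080808080808080), so each store writes 8 pixels at once */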
+        const uint64_t dcN = dc * 0x0101010101010101ULL;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((uint64_t *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    } else {
+        const unsigned dcN = dc * 0x01010101U;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((unsigned *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    }
+#else
+    assert(dc <= bitdepth_max);
+    const uint64_t dcN = dc * 0x0001000100010001ULL;
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+            *((uint64_t *) &dst[x]) = dcN;
+        dst += PXSTRIDE(stride);
+    }
+#endif
+}
+
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const int dc,
+         const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
+{
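+    /* ac is q3 (cfl_ac_c scales luma by 8) and alpha is a q3 scalar per
+     * ipred.h, so alpha * ac is q6; (abs(diff) + 32) >> 6 is a rounded,
+     * sign-preserving descale back to pixel units */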
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int diff = alpha * ac[x];
+            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+        }
+        ac += width;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
+    unsigned dc = width >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[1 + i];
+    return dc >> ctz(width);
+}
+
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha
+                            HIGHBD_DECL_SUFFIX)
+{
+    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha
+             HIGHBD_TAIL_SUFFIX);
+}
+
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
+    unsigned dc = height >> 1;
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(1 + i)];
+    return dc >> ctz(height);
+}
+
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height, const int a,
+                            const int max_width, const int max_height
+                            HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height,
+                             const int16_t *ac, const int alpha
+                             HIGHBD_DECL_SUFFIX)
+{
+    const unsigned dc = dc_gen_left(topleft, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
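+
+/* fixed-point reciprocals: MULTIPLIER_1x2 ~ 2^BASE_SHIFT / 3 and
+ * MULTIPLIER_1x4 ~ 2^BASE_SHIFT / 5. In dc_gen() below, width + height is
+ * 3x or 5x a power of two for 1:2 and 1:4 aspect ratios, so the shift
+ * removes the power-of-two part and the multiply finishes the division */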
+
+static unsigned dc_gen(const pixel *const topleft,
+                       const int width, const int height)
+{
+    unsigned dc = (width + height) >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[i + 1];
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(i + 1)];
+    dc >>= ctz(width + height);
+
+    if (width != height) {
+        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+                                                           MULTIPLIER_1x2;
+        dc >>= BASE_SHIFT;
+    }
+    return dc;
+}
+
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft,
+                       const int width, const int height, const int a,
+                       const int max_width, const int max_height
+                       HIGHBD_DECL_SUFFIX)
+{
+    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height)
+             HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+                        const pixel *const topleft,
+                        const int width, const int height,
+                        const int16_t *ac, const int alpha
+                        HIGHBD_DECL_SUFFIX)
+{
+    unsigned dc = dc_gen(topleft, width, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+    const int dc = (bitdepth_max + 1) >> 1;
+#else
+    const int dc = 128;
+#endif
+    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha
+                            HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+    const int dc = (bitdepth_max + 1) >> 1;
+#else
+    const int dc = 128;
+#endif
+    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a,
+                      const int max_width, const int max_height
+                      HIGHBD_DECL_SUFFIX)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_copy(dst, topleft + 1, width);
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a,
+                      const int max_width, const int max_height
+                      HIGHBD_DECL_SUFFIX)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_set(dst, topleft[-(1 + y)], width);
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const tl_ptr,
+                          const int width, const int height, const int a,
+                          const int max_width, const int max_height
+                          HIGHBD_DECL_SUFFIX)
+{
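+    /* Paeth: predict left + top - topleft, then output whichever of left,
+     * top or topleft is closest to that prediction */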
+    const int topleft = tl_ptr[0];
+    for (int y = 0; y < height; y++) {
+        const int left = tl_ptr[-(y + 1)];
+        for (int x = 0; x < width; x++) {
+            const int top = tl_ptr[1 + x];
+            const int base = left + top - topleft;
+            const int ldiff = abs(left - base);
+            const int tdiff = abs(top - base);
+            const int tldiff = abs(topleft - base);
+
+            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+                     tdiff <= tldiff ? top : topleft;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+    const int right = topleft[width], bottom = topleft[-height];
+
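+    /* two complementary 256-weight blends (vertical and horizontal) are
+     * summed, for a total weight of 512, hence the rounded >> 9 */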
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom +
+                             weights_hor[x]  * topleft[-(1 + y)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 256) >> 9;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a,
+                             const int max_width, const int max_height
+                             HIGHBD_DECL_SUFFIX)
+{
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+    const int bottom = topleft[-height];
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom;
+            dst[x] = (pred + 128) >> 8;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a,
+                             const int max_width, const int max_height
+                             HIGHBD_DECL_SUFFIX)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+    const int right = topleft[width];
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 128) >> 8;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static NOINLINE int get_filter_strength(const int wh, const int angle,
+                                        const int is_sm)
+{
+    if (is_sm) {
+        if (wh <= 8) {
+            if (angle >= 64) return 2;
+            if (angle >= 40) return 1;
+        } else if (wh <= 16) {
+            if (angle >= 48) return 2;
+            if (angle >= 20) return 1;
+        } else if (wh <= 24) {
+            if (angle >=  4) return 3;
+        } else {
+            return 3;
+        }
+    } else {
+        if (wh <= 8) {
+            if (angle >= 56) return 1;
+        } else if (wh <= 16) {
+            if (angle >= 40) return 1;
+        } else if (wh <= 24) {
+            if (angle >= 32) return 3;
+            if (angle >= 16) return 2;
+            if (angle >=  8) return 1;
+        } else if (wh <= 32) {
+            if (angle >= 32) return 3;
+            if (angle >=  4) return 2;
+            return 1;
+        } else {
+            return 3;
+        }
+    }
+    return 0;
+}
+
+static NOINLINE void filter_edge(pixel *const out, const int sz,
+                                 const int lim_from, const int lim_to,
+                                 const pixel *const in, const int from,
+                                 const int to, const int strength)
+{
+    static const uint8_t kernel[3][5] = {
+        { 0, 4, 8, 4, 0 },
+        { 0, 5, 6, 5, 0 },
+        { 2, 4, 4, 4, 2 }
+    };
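+    /* each kernel row sums to 16, so (s + 8) >> 4 below is a rounded
+     * normalization of the 5-tap smoothing */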
+
+    assert(strength > 0);
+    int i = 0;
+    for (; i < imin(sz, lim_from); i++)
+        out[i] = in[iclip(i, from, to - 1)];
+    for (; i < imin(lim_to, sz); i++) {
+        int s = 0;
+        for (int j = 0; j < 5; j++)
+            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
+        out[i] = (s + 8) >> 4;
+    }
+    for (; i < sz; i++)
+        out[i] = in[iclip(i, from, to - 1)];
+}
+
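+/* edges are upsampled only within 40 degrees of the relevant axis and only
+ * for small blocks: combined width + height up to 16, or up to 8 when the
+ * smooth-edge flag is set (16 >> is_sm binds tighter than <=) */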
+static inline int get_upsample(const int wh, const int angle, const int is_sm) {
+    return angle < 40 && wh <= 16 >> is_sm;
+}
+
+static NOINLINE void upsample_edge(pixel *const out, const int hsz,
+                                   const pixel *const in, const int from,
+                                   const int to HIGHBD_DECL_SUFFIX)
+{
+    static const int8_t kernel[4] = { -1, 9, 9, -1 };
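+    /* the 4 taps sum to 16, so (s + 8) >> 4 is a rounded normalization;
+     * iclip_pixel() catches the over/undershoot the negative taps allow */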
+    int i;
+    for (i = 0; i < hsz - 1; i++) {
+        out[i * 2] = in[iclip(i, from, to - 1)];
+
+        int s = 0;
+        for (int j = 0; j < 4; j++)
+            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
+        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
+    }
+    out[i * 2] = in[iclip(i, from, to - 1)];
+}
+
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height
+                       HIGHBD_DECL_SUFFIX)
+{
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
+    angle &= 511;
+    assert(angle < 90);
+    int dx = dav1d_dr_intra_derivative[angle >> 1];
+    pixel top_out[64 + 64];
+    const pixel *top;
+    int max_base_x;
+    const int upsample_above = enable_intra_edge_filter ?
+        get_upsample(width + height, 90 - angle, is_sm) : 0;
+    if (upsample_above) {
+        upsample_edge(top_out, width + height, &topleft_in[1], -1,
+                      width + imin(width, height) HIGHBD_TAIL_SUFFIX);
+        top = top_out;
+        max_base_x = 2 * (width + height) - 2;
+        dx <<= 1;
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+        if (filter_strength) {
+            filter_edge(top_out, width + height, 0, width + height,
+                        &topleft_in[1], -1, width + imin(width, height),
+                        filter_strength);
+            top = top_out;
+            max_base_x = width + height - 1;
+        } else {
+            top = &topleft_in[1];
+            max_base_x = width + imin(width, height) - 1;
+        }
+    }
+    const int base_inc = 1 + upsample_above;
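+    /* xpos tracks the source column in 1/64-pixel fixed point: base is its
+     * integer part and frac its (even) 6-bit fraction, used for the 2-tap
+     * blend below */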
+    for (int y = 0, xpos = dx; y < height;
+         y++, dst += PXSTRIDE(stride), xpos += dx)
+    {
+        const int frac = xpos & 0x3E;
+
+        for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
+            if (base < max_base_x) {
+                const int v = top[base] * (64 - frac) + top[base + 1] * frac;
+                dst[x] = (v + 32) >> 6;
+            } else {
+                pixel_set(&dst[x], top[max_base_x], width - x);
+                break;
+            }
+        }
+    }
+}
+
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height
+                       HIGHBD_DECL_SUFFIX)
+{
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
+    angle &= 511;
+    assert(angle > 90 && angle < 180);
+    int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+    int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
+    const int upsample_left = enable_intra_edge_filter ?
+        get_upsample(width + height, 180 - angle, is_sm) : 0;
+    const int upsample_above = enable_intra_edge_filter ?
+        get_upsample(width + height, angle - 90, is_sm) : 0;
+    pixel edge[64 + 64 + 1];
+    pixel *const topleft = &edge[64];
+
+    if (upsample_above) {
+        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
+                      HIGHBD_TAIL_SUFFIX);
+        dx <<= 1;
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, angle - 90, is_sm) : 0;
+
+        if (filter_strength) {
+            filter_edge(&topleft[1], width, 0, max_width,
+                        &topleft_in[1], -1, width,
+                        filter_strength);
+        } else {
+            pixel_copy(&topleft[1], &topleft_in[1], width);
+        }
+    }
+    if (upsample_left) {
+        upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height],
+                      0, height + 1 HIGHBD_TAIL_SUFFIX);
+        dy <<= 1;
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, 180 - angle, is_sm) : 0;
+
+        if (filter_strength) {
+            filter_edge(&topleft[-height], height, height - max_height, height,
+                        &topleft_in[-height],
+                        0, height + 1, filter_strength);
+        } else {
+            pixel_copy(&topleft[-height], &topleft_in[-height], height);
+        }
+    }
+    *topleft = *topleft_in;
+
+    const int base_inc_x = 1 + upsample_above;
+    const pixel *const left = &topleft[-(1 + upsample_left)];
+    for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
+         y++, xpos -= dx, dst += PXSTRIDE(stride))
+    {
+        int base_x = xpos >> 6;
+        const int frac_x = xpos & 0x3E;
+
+        for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
+             x++, base_x += base_inc_x, ypos -= dy)
+        {
+            int v;
+            if (base_x >= 0) {
+                v = topleft[base_x] * (64 - frac_x) +
+                    topleft[base_x + 1] * frac_x;
+            } else {
+                const int base_y = ypos >> 6;
+                assert(base_y >= -(1 + upsample_left));
+                const int frac_y = ypos & 0x3E;
+                v = left[-base_y] * (64 - frac_y) +
+                    left[-(base_y + 1)] * frac_y;
+            }
+            dst[x] = (v + 32) >> 6;
+        }
+    }
+}
+
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height
+                       HIGHBD_DECL_SUFFIX)
+{
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
+    angle &= 511;
+    assert(angle > 180);
+    int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+    pixel left_out[64 + 64];
+    const pixel *left;
+    int max_base_y;
+    const int upsample_left = enable_intra_edge_filter ?
+        get_upsample(width + height, angle - 180, is_sm) : 0;
+    if (upsample_left) {
+        upsample_edge(left_out, width + height,
+                      &topleft_in[-(width + height)],
+                      imax(width - height, 0), width + height + 1
+                      HIGHBD_TAIL_SUFFIX);
+        left = &left_out[2 * (width + height) - 2];
+        max_base_y = 2 * (width + height) - 2;
+        dy <<= 1;
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+        if (filter_strength) {
+            filter_edge(left_out, width + height, 0, width + height,
+                        &topleft_in[-(width + height)],
+                        imax(width - height, 0), width + height + 1,
+                        filter_strength);
+            left = &left_out[width + height - 1];
+            max_base_y = width + height - 1;
+        } else {
+            left = &topleft_in[-1];
+            max_base_y = height + imin(width, height) - 1;
+        }
+    }
+    const int base_inc = 1 + upsample_left;
+    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
+        const int frac = ypos & 0x3E;
+
+        for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
+            if (base < max_base_y) {
+                const int v = left[-base] * (64 - frac) +
+                              left[-(base + 1)] * frac;
+                dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
+            } else {
+                do {
+                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+                } while (++y < height);
+                break;
+            }
+        }
+    }
+}
+
+#if ARCH_X86
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+    flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +           \
+    flt_ptr[16] * p2 + flt_ptr[17] * p3 +           \
+    flt_ptr[32] * p4 + flt_ptr[33] * p5 +           \
+    flt_ptr[48] * p6
+#define FLT_INCR 2
+#else
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+    flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 +           \
+    flt_ptr[16] * p2 + flt_ptr[24] * p3 +           \
+    flt_ptr[32] * p4 + flt_ptr[40] * p5 +           \
+    flt_ptr[48] * p6
+#define FLT_INCR 1
+#endif
+
+/* Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft_in,
+                           const int width, const int height, int filt_idx,
+                           const int max_width, const int max_height
+                           HIGHBD_DECL_SUFFIX)
+{
+    filt_idx &= 511;
+    assert(filt_idx < 5);
+
+    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+    const pixel *top = &topleft_in[1];
+    for (int y = 0; y < height; y += 2) {
+        const pixel *topleft = &topleft_in[-y];
+        const pixel *left = &topleft[-1];
+        ptrdiff_t left_stride = -1;
+        for (int x = 0; x < width; x += 4) {
+            const int p0 = *topleft;
+            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+            pixel *ptr = &dst[x];
+            const int8_t *flt_ptr = filter;
+
+            for (int yy = 0; yy < 2; yy++) {
+                for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
+                    const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
+                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
+                }
+                ptr += PXSTRIDE(stride);
+            }
+            left = &dst[x + 4 - 1];
+            left_stride = PXSTRIDE(stride);
+            top += 4;
+            topleft = &top[-1];
+        }
+        top = &dst[PXSTRIDE(stride)];
+        dst = &dst[PXSTRIDE(stride) * 2];
+    }
+}
+
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+         const int w_pad, const int h_pad, const int width, const int height,
+         const int ss_hor, const int ss_ver)
+{
+    int y, x;
+    int16_t *const ac_orig = ac;
+
+    assert(w_pad >= 0 && w_pad * 4 < width);
+    assert(h_pad >= 0 && h_pad * 4 < height);
+
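+    /* 4:2:0 sums 4 luma samples << 1, 4:2:2 sums 2 << 2, 4:4:4 takes
+     * 1 << 3, so every layout lands on the same 8x (q3) scale before the
+     * DC average is subtracted */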
+    for (y = 0; y < height - 4 * h_pad; y++) {
+        for (x = 0; x < width - 4 * w_pad; x++) {
+            int ac_sum = ypx[x << ss_hor];
+            if (ss_hor) ac_sum += ypx[x * 2 + 1];
+            if (ss_ver) {
+                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
+                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
+            }
+            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
+        }
+        for (; x < width; x++)
+            ac[x] = ac[x - 1];
+        ac += width;
+        ypx += PXSTRIDE(stride) << ss_ver;
+    }
+    for (; y < height; y++) {
+        memcpy(ac, &ac[-width], width * sizeof(*ac));
+        ac += width;
+    }
+
+    const int log2sz = ctz(width) + ctz(height);
+    int sum = (1 << log2sz) >> 1;
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            sum += ac[x];
+        ac += width;
+    }
+    sum >>= log2sz;
+
+    // subtract DC
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            ac[x] -= sum;
+        ac += width;
+    }
+}
+
+#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
+static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
+                             const ptrdiff_t stride, const int w_pad, \
+                             const int h_pad, const int cw, const int ch) \
+{ \
+    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
+}
+
+cfl_ac_fn(420, 1, 1)
+cfl_ac_fn(422, 1, 0)
+cfl_ac_fn(444, 0, 0)
+
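+/* Palette prediction: map per-pixel palette indices to palette entries. */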
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+                       const uint16_t *const pal, const uint8_t *idx,
+                       const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            dst[x] = (pixel) pal[idx[x]];
+        idx += w;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
+    c->intra_pred[DC_PRED      ] = ipred_dc_c;
+    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
+    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
+    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+    c->intra_pred[HOR_PRED     ] = ipred_h_c;
+    c->intra_pred[VERT_PRED    ] = ipred_v_c;
+    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
+    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
+    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
+    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
+    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
+    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
+
+    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
+    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+    c->pal_pred = pal_pred_c;
+
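+    /* Let hand-written SIMD replace the C fallbacks above where available. */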
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/src/itx.h b/src/itx.h
new file mode 100644 (file)
index 0000000..a299629
--- /dev/null
+++ b/src/itx.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ITX_H
+#define DAV1D_SRC_ITX_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_itx_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_itx_fn(*itxfm_fn);
+
+typedef struct Dav1dInvTxfmDSPContext {
+    itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
+} Dav1dInvTxfmDSPContext;
+
+bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc);
+bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c);
+
+#endif /* DAV1D_SRC_ITX_H */
diff --git a/src/itx_1d.c b/src/itx_1d.c
new file mode 100644 (file)
index 0000000..ca14fc8
--- /dev/null
+++ b/src/itx_1d.c
@@ -0,0 +1,1034 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/itx_1d.h"
+
+#define CLIP(a) iclip(a, min, max)
+
+/*
+ * In some places, we use a pattern like this:
+ * t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ * even though the reference code uses something like:
+ * t2 =  (in1 *  1567         - in3 *  3784         + 2048) >> 12;
+ *
+ * The reason is that in 12 bits/component bitstreams (corrupt/invalid ones,
+ * but codable nonetheless), each coefficient or input can be 19(+sign) bits,
+ * so if the combination of the two multipliers (each 12 bits) is >= 4096,
+ * the result of the add/sub after the pair of multiplies can exceed the
+ * 31+sign bit range. Signed integer overflow is UB in C, and we'd like to
+ * prevent that.
+ *
+ * To work around this, we invert one of the two coefficients (or, if both
+ * are multiples of 2, we reduce their magnitude by one bit). Note that SIMD
+ * implementations do not have to follow this exact behaviour: the AV1 spec
+ * clearly states that the result of the multiply/add pairs should fit in
+ * 31+sign bit intermediates, and that streams violating this convention are
+ * not AV1-compliant. As long as we don't trigger UB (which some people
+ * would consider a security vulnerability), we're fine, so SIMD can simply
+ * use the faster implementation, even if that might in some cases result in
+ * integer overflows: such streams are not valid AV1 anyway, and in e.g. x86
+ * assembly, integer overflow is not UB but merely wraps around.
+ */
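+
+/*
+ * To see why the pattern above is exact: -in3 * (3784 - 4096) equals
+ * -in3 * 3784 + (in3 << 12), and adding a multiple of 4096 ahead of an
+ * arithmetic >> 12 adds exactly in3 to the shifted result, so
+ *
+ *   ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3
+ *       == (in1 * 1567 - in3 * 3784 + 2048) >> 12
+ *
+ * while the magnitude of the in3 multiplier drops from 3784 to 312, which
+ * keeps the 32-bit intermediate in range even for 19(+sign)-bit inputs.
+ */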
+
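+/* The tx64 flag selects the 64-point-transform variant: only the lowest 32
+ * coefficients of a 64-point dimension can be non-zero, so the upper half
+ * of the inputs is known to be zero and the butterflies simplify
+ * accordingly. */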
+static NOINLINE void
+inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+                       const int min, const int max, const int tx64)
+{
+    assert(stride > 0);
+    const int in0 = c[0 * stride], in1 = c[1 * stride];
+
+    int t0, t1, t2, t3;
+    if (tx64) {
+        t0 = t1 = (in0 * 181 + 128) >> 8;
+        t2 = (in1 * 1567 + 2048) >> 12;
+        t3 = (in1 * 3784 + 2048) >> 12;
+    } else {
+        const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+        t0 = ((in0 + in2) * 181 + 128) >> 8;
+        t1 = ((in0 - in2) * 181 + 128) >> 8;
+        t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+        t3 = ((in1 * (3784 - 4096) + in3 *  1567         + 2048) >> 12) + in1;
+    }
+
+    c[0 * stride] = CLIP(t0 + t3);
+    c[1 * stride] = CLIP(t1 + t2);
+    c[2 * stride] = CLIP(t1 - t2);
+    c[3 * stride] = CLIP(t0 - t3);
+}
+
+void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
+                         const int min, const int max)
+{
+    inv_dct4_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+                       const int min, const int max, const int tx64)
+{
+    assert(stride > 0);
+    inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);
+
+    const int in1 = c[1 * stride], in3 = c[3 * stride];
+
+    int t4a, t5a, t6a, t7a;
+    if (tx64) {
+        t4a = (in1 *   799 + 2048) >> 12;
+        t5a = (in3 * -2276 + 2048) >> 12;
+        t6a = (in3 *  3406 + 2048) >> 12;
+        t7a = (in1 *  4017 + 2048) >> 12;
+    } else {
+        const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+        t4a = ((in1 *   799         - in7 * (4017 - 4096) + 2048) >> 12) - in7;
+        t5a =  (in5 *  1703         - in3 *  1138         + 1024) >> 11;
+        t6a =  (in5 *  1138         + in3 *  1703         + 1024) >> 11;
+        t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;
+    }
+
+    const int t4  = CLIP(t4a + t5a);
+              t5a = CLIP(t4a - t5a);
+    const int t7  = CLIP(t7a + t6a);
+              t6a = CLIP(t7a - t6a);
+
+    const int t5  = ((t6a - t5a) * 181 + 128) >> 8;
+    const int t6  = ((t6a + t5a) * 181 + 128) >> 8;
+
+    const int t0 = c[0 * stride];
+    const int t1 = c[2 * stride];
+    const int t2 = c[4 * stride];
+    const int t3 = c[6 * stride];
+
+    c[0 * stride] = CLIP(t0 + t7);
+    c[1 * stride] = CLIP(t1 + t6);
+    c[2 * stride] = CLIP(t2 + t5);
+    c[3 * stride] = CLIP(t3 + t4);
+    c[4 * stride] = CLIP(t3 - t4);
+    c[5 * stride] = CLIP(t2 - t5);
+    c[6 * stride] = CLIP(t1 - t6);
+    c[7 * stride] = CLIP(t0 - t7);
+}
+
+void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
+                         const int min, const int max)
+{
+    inv_dct8_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+                        const int min, const int max, int tx64)
+{
+    assert(stride > 0);
+    inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);
+
+    const int in1 = c[1 * stride], in3 = c[3 * stride];
+    const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+    if (tx64) {
+        t8a  = (in1 *   401 + 2048) >> 12;
+        t9a  = (in7 * -2598 + 2048) >> 12;
+        t10a = (in5 *  1931 + 2048) >> 12;
+        t11a = (in3 * -1189 + 2048) >> 12;
+        t12a = (in3 *  3920 + 2048) >> 12;
+        t13a = (in5 *  3612 + 2048) >> 12;
+        t14a = (in7 *  3166 + 2048) >> 12;
+        t15a = (in1 *  4076 + 2048) >> 12;
+    } else {
+        const int in9  = c[ 9 * stride], in11 = c[11 * stride];
+        const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+        t8a  = ((in1  *   401         - in15 * (4076 - 4096) + 2048) >> 12) - in15;
+        t9a  =  (in9  *  1583         - in7  *  1299         + 1024) >> 11;
+        t10a = ((in5  *  1931         - in11 * (3612 - 4096) + 2048) >> 12) - in11;
+        t11a = ((in13 * (3920 - 4096) - in3  *  1189         + 2048) >> 12) + in13;
+        t12a = ((in13 *  1189         + in3  * (3920 - 4096) + 2048) >> 12) + in3;
+        t13a = ((in5  * (3612 - 4096) + in11 *  1931         + 2048) >> 12) + in5;
+        t14a =  (in9  *  1299         + in7  *  1583         + 1024) >> 11;
+        t15a = ((in1  * (4076 - 4096) + in15 *   401         + 2048) >> 12) + in1;
+    }
+
+    int t8  = CLIP(t8a  + t9a);
+    int t9  = CLIP(t8a  - t9a);
+    int t10 = CLIP(t11a - t10a);
+    int t11 = CLIP(t11a + t10a);
+    int t12 = CLIP(t12a + t13a);
+    int t13 = CLIP(t12a - t13a);
+    int t14 = CLIP(t15a - t14a);
+    int t15 = CLIP(t15a + t14a);
+
+    t9a  = ((  t14 *  1567         - t9  * (3784 - 4096)  + 2048) >> 12) - t9;
+    t14a = ((  t14 * (3784 - 4096) + t9  *  1567          + 2048) >> 12) + t14;
+    t10a = ((-(t13 * (3784 - 4096) + t10 *  1567)         + 2048) >> 12) - t13;
+    t13a = ((  t13 *  1567         - t10 * (3784 - 4096)  + 2048) >> 12) - t10;
+
+    t8a  = CLIP(t8   + t11);
+    t9   = CLIP(t9a  + t10a);
+    t10  = CLIP(t9a  - t10a);
+    t11a = CLIP(t8   - t11);
+    t12a = CLIP(t15  - t12);
+    t13  = CLIP(t14a - t13a);
+    t14  = CLIP(t14a + t13a);
+    t15a = CLIP(t15  + t12);
+
+    t10a = ((t13  - t10)  * 181 + 128) >> 8;
+    t13a = ((t13  + t10)  * 181 + 128) >> 8;
+    t11  = ((t12a - t11a) * 181 + 128) >> 8;
+    t12  = ((t12a + t11a) * 181 + 128) >> 8;
+
+    const int t0 = c[ 0 * stride];
+    const int t1 = c[ 2 * stride];
+    const int t2 = c[ 4 * stride];
+    const int t3 = c[ 6 * stride];
+    const int t4 = c[ 8 * stride];
+    const int t5 = c[10 * stride];
+    const int t6 = c[12 * stride];
+    const int t7 = c[14 * stride];
+
+    c[ 0 * stride] = CLIP(t0 + t15a);
+    c[ 1 * stride] = CLIP(t1 + t14);
+    c[ 2 * stride] = CLIP(t2 + t13a);
+    c[ 3 * stride] = CLIP(t3 + t12);
+    c[ 4 * stride] = CLIP(t4 + t11);
+    c[ 5 * stride] = CLIP(t5 + t10a);
+    c[ 6 * stride] = CLIP(t6 + t9);
+    c[ 7 * stride] = CLIP(t7 + t8a);
+    c[ 8 * stride] = CLIP(t7 - t8a);
+    c[ 9 * stride] = CLIP(t6 - t9);
+    c[10 * stride] = CLIP(t5 - t10a);
+    c[11 * stride] = CLIP(t4 - t11);
+    c[12 * stride] = CLIP(t3 - t12);
+    c[13 * stride] = CLIP(t2 - t13a);
+    c[14 * stride] = CLIP(t1 - t14);
+    c[15 * stride] = CLIP(t0 - t15a);
+}
+
+void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
+                          const int min, const int max)
+{
+    inv_dct16_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+                        const int min, const int max, const int tx64)
+{
+    assert(stride > 0);
+    inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);
+
+    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];
+    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];
+    const int in9  = c[ 9 * stride], in11 = c[11 * stride];
+    const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+    int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
+    int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
+    if (tx64) {
+        t16a = (in1  *   201 + 2048) >> 12;
+        t17a = (in15 * -2751 + 2048) >> 12;
+        t18a = (in9  *  1751 + 2048) >> 12;
+        t19a = (in7  * -1380 + 2048) >> 12;
+        t20a = (in5  *   995 + 2048) >> 12;
+        t21a = (in11 * -2106 + 2048) >> 12;
+        t22a = (in13 *  2440 + 2048) >> 12;
+        t23a = (in3  *  -601 + 2048) >> 12;
+        t24a = (in3  *  4052 + 2048) >> 12;
+        t25a = (in13 *  3290 + 2048) >> 12;
+        t26a = (in11 *  3513 + 2048) >> 12;
+        t27a = (in5  *  3973 + 2048) >> 12;
+        t28a = (in7  *  3857 + 2048) >> 12;
+        t29a = (in9  *  3703 + 2048) >> 12;
+        t30a = (in15 *  3035 + 2048) >> 12;
+        t31a = (in1  *  4091 + 2048) >> 12;
+    } else {
+        const int in17 = c[17 * stride], in19 = c[19 * stride];
+        const int in21 = c[21 * stride], in23 = c[23 * stride];
+        const int in25 = c[25 * stride], in27 = c[27 * stride];
+        const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+        t16a = ((in1  *   201         - in31 * (4091 - 4096) + 2048) >> 12) - in31;
+        t17a = ((in17 * (3035 - 4096) - in15 *  2751         + 2048) >> 12) + in17;
+        t18a = ((in9  *  1751         - in23 * (3703 - 4096) + 2048) >> 12) - in23;
+        t19a = ((in25 * (3857 - 4096) - in7  *  1380         + 2048) >> 12) + in25;
+        t20a = ((in5  *   995         - in27 * (3973 - 4096) + 2048) >> 12) - in27;
+        t21a = ((in21 * (3513 - 4096) - in11 *  2106         + 2048) >> 12) + in21;
+        t22a =  (in13 *  1220         - in19 *  1645         + 1024) >> 11;
+        t23a = ((in29 * (4052 - 4096) - in3  *   601         + 2048) >> 12) + in29;
+        t24a = ((in29 *   601         + in3  * (4052 - 4096) + 2048) >> 12) + in3;
+        t25a =  (in13 *  1645         + in19 *  1220         + 1024) >> 11;
+        t26a = ((in21 *  2106         + in11 * (3513 - 4096) + 2048) >> 12) + in11;
+        t27a = ((in5  * (3973 - 4096) + in27 *   995         + 2048) >> 12) + in5;
+        t28a = ((in25 *  1380         + in7  * (3857 - 4096) + 2048) >> 12) + in7;
+        t29a = ((in9  * (3703 - 4096) + in23 *  1751         + 2048) >> 12) + in9;
+        t30a = ((in17 *  2751         + in15 * (3035 - 4096) + 2048) >> 12) + in15;
+        t31a = ((in1  * (4091 - 4096) + in31 *   201         + 2048) >> 12) + in1;
+    }
+
+    int t16 = CLIP(t16a + t17a);
+    int t17 = CLIP(t16a - t17a);
+    int t18 = CLIP(t19a - t18a);
+    int t19 = CLIP(t19a + t18a);
+    int t20 = CLIP(t20a + t21a);
+    int t21 = CLIP(t20a - t21a);
+    int t22 = CLIP(t23a - t22a);
+    int t23 = CLIP(t23a + t22a);
+    int t24 = CLIP(t24a + t25a);
+    int t25 = CLIP(t24a - t25a);
+    int t26 = CLIP(t27a - t26a);
+    int t27 = CLIP(t27a + t26a);
+    int t28 = CLIP(t28a + t29a);
+    int t29 = CLIP(t28a - t29a);
+    int t30 = CLIP(t31a - t30a);
+    int t31 = CLIP(t31a + t30a);
+
+    t17a = ((  t30 *   799         - t17 * (4017 - 4096)  + 2048) >> 12) - t17;
+    t30a = ((  t30 * (4017 - 4096) + t17 *   799          + 2048) >> 12) + t30;
+    t18a = ((-(t29 * (4017 - 4096) + t18 *   799)         + 2048) >> 12) - t29;
+    t29a = ((  t29 *   799         - t18 * (4017 - 4096)  + 2048) >> 12) - t18;
+    t21a =  (  t26 *  1703         - t21 *  1138          + 1024) >> 11;
+    t26a =  (  t26 *  1138         + t21 *  1703          + 1024) >> 11;
+    t22a =  (-(t25 *  1138         + t22 *  1703        ) + 1024) >> 11;
+    t25a =  (  t25 *  1703         - t22 *  1138          + 1024) >> 11;
+
+    t16a = CLIP(t16  + t19);
+    t17  = CLIP(t17a + t18a);
+    t18  = CLIP(t17a - t18a);
+    t19a = CLIP(t16  - t19);
+    t20a = CLIP(t23  - t20);
+    t21  = CLIP(t22a - t21a);
+    t22  = CLIP(t22a + t21a);
+    t23a = CLIP(t23  + t20);
+    t24a = CLIP(t24  + t27);
+    t25  = CLIP(t25a + t26a);
+    t26  = CLIP(t25a - t26a);
+    t27a = CLIP(t24  - t27);
+    t28a = CLIP(t31  - t28);
+    t29  = CLIP(t30a - t29a);
+    t30  = CLIP(t30a + t29a);
+    t31a = CLIP(t31  + t28);
+
+    t18a = ((  t29  *  1567         - t18  * (3784 - 4096)  + 2048) >> 12) - t18;
+    t29a = ((  t29  * (3784 - 4096) + t18  *  1567          + 2048) >> 12) + t29;
+    t19  = ((  t28a *  1567         - t19a * (3784 - 4096)  + 2048) >> 12) - t19a;
+    t28  = ((  t28a * (3784 - 4096) + t19a *  1567          + 2048) >> 12) + t28a;
+    t20  = ((-(t27a * (3784 - 4096) + t20a *  1567)         + 2048) >> 12) - t27a;
+    t27  = ((  t27a *  1567         - t20a * (3784 - 4096)  + 2048) >> 12) - t20a;
+    t21a = ((-(t26  * (3784 - 4096) + t21  *  1567)         + 2048) >> 12) - t26;
+    t26a = ((  t26  *  1567         - t21  * (3784 - 4096)  + 2048) >> 12) - t21;
+
+    t16  = CLIP(t16a + t23a);
+    t17a = CLIP(t17  + t22);
+    t18  = CLIP(t18a + t21a);
+    t19a = CLIP(t19  + t20);
+    t20a = CLIP(t19  - t20);
+    t21  = CLIP(t18a - t21a);
+    t22a = CLIP(t17  - t22);
+    t23  = CLIP(t16a - t23a);
+    t24  = CLIP(t31a - t24a);
+    t25a = CLIP(t30  - t25);
+    t26  = CLIP(t29a - t26a);
+    t27a = CLIP(t28  - t27);
+    t28a = CLIP(t28  + t27);
+    t29  = CLIP(t29a + t26a);
+    t30a = CLIP(t30  + t25);
+    t31  = CLIP(t31a + t24a);
+
+    t20  = ((t27a - t20a) * 181 + 128) >> 8;
+    t27  = ((t27a + t20a) * 181 + 128) >> 8;
+    t21a = ((t26  - t21 ) * 181 + 128) >> 8;
+    t26a = ((t26  + t21 ) * 181 + 128) >> 8;
+    t22  = ((t25a - t22a) * 181 + 128) >> 8;
+    t25  = ((t25a + t22a) * 181 + 128) >> 8;
+    t23a = ((t24  - t23 ) * 181 + 128) >> 8;
+    t24a = ((t24  + t23 ) * 181 + 128) >> 8;
+
+    const int t0  = c[ 0 * stride];
+    const int t1  = c[ 2 * stride];
+    const int t2  = c[ 4 * stride];
+    const int t3  = c[ 6 * stride];
+    const int t4  = c[ 8 * stride];
+    const int t5  = c[10 * stride];
+    const int t6  = c[12 * stride];
+    const int t7  = c[14 * stride];
+    const int t8  = c[16 * stride];
+    const int t9  = c[18 * stride];
+    const int t10 = c[20 * stride];
+    const int t11 = c[22 * stride];
+    const int t12 = c[24 * stride];
+    const int t13 = c[26 * stride];
+    const int t14 = c[28 * stride];
+    const int t15 = c[30 * stride];
+
+    c[ 0 * stride] = CLIP(t0  + t31);
+    c[ 1 * stride] = CLIP(t1  + t30a);
+    c[ 2 * stride] = CLIP(t2  + t29);
+    c[ 3 * stride] = CLIP(t3  + t28a);
+    c[ 4 * stride] = CLIP(t4  + t27);
+    c[ 5 * stride] = CLIP(t5  + t26a);
+    c[ 6 * stride] = CLIP(t6  + t25);
+    c[ 7 * stride] = CLIP(t7  + t24a);
+    c[ 8 * stride] = CLIP(t8  + t23a);
+    c[ 9 * stride] = CLIP(t9  + t22);
+    c[10 * stride] = CLIP(t10 + t21a);
+    c[11 * stride] = CLIP(t11 + t20);
+    c[12 * stride] = CLIP(t12 + t19a);
+    c[13 * stride] = CLIP(t13 + t18);
+    c[14 * stride] = CLIP(t14 + t17a);
+    c[15 * stride] = CLIP(t15 + t16);
+    c[16 * stride] = CLIP(t15 - t16);
+    c[17 * stride] = CLIP(t14 - t17a);
+    c[18 * stride] = CLIP(t13 - t18);
+    c[19 * stride] = CLIP(t12 - t19a);
+    c[20 * stride] = CLIP(t11 - t20);
+    c[21 * stride] = CLIP(t10 - t21a);
+    c[22 * stride] = CLIP(t9  - t22);
+    c[23 * stride] = CLIP(t8  - t23a);
+    c[24 * stride] = CLIP(t7  - t24a);
+    c[25 * stride] = CLIP(t6  - t25);
+    c[26 * stride] = CLIP(t5  - t26a);
+    c[27 * stride] = CLIP(t4  - t27);
+    c[28 * stride] = CLIP(t3  - t28a);
+    c[29 * stride] = CLIP(t2  - t29);
+    c[30 * stride] = CLIP(t1  - t30a);
+    c[31 * stride] = CLIP(t0  - t31);
+}
+
+void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
+                          const int min, const int max)
+{
+    inv_dct32_1d_internal_c(c, stride, min, max, 0);
+}
+
+void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
+                          const int min, const int max)
+{
+    assert(stride > 0);
+    inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
+
+    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];
+    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];
+    const int in9  = c[ 9 * stride], in11 = c[11 * stride];
+    const int in13 = c[13 * stride], in15 = c[15 * stride];
+    const int in17 = c[17 * stride], in19 = c[19 * stride];
+    const int in21 = c[21 * stride], in23 = c[23 * stride];
+    const int in25 = c[25 * stride], in27 = c[27 * stride];
+    const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+    int t32a = (in1  *   101 + 2048) >> 12;
+    int t33a = (in31 * -2824 + 2048) >> 12;
+    int t34a = (in17 *  1660 + 2048) >> 12;
+    int t35a = (in15 * -1474 + 2048) >> 12;
+    int t36a = (in9  *   897 + 2048) >> 12;
+    int t37a = (in23 * -2191 + 2048) >> 12;
+    int t38a = (in25 *  2359 + 2048) >> 12;
+    int t39a = (in7  *  -700 + 2048) >> 12;
+    int t40a = (in5  *   501 + 2048) >> 12;
+    int t41a = (in27 * -2520 + 2048) >> 12;
+    int t42a = (in21 *  2019 + 2048) >> 12;
+    int t43a = (in11 * -1092 + 2048) >> 12;
+    int t44a = (in13 *  1285 + 2048) >> 12;
+    int t45a = (in19 * -1842 + 2048) >> 12;
+    int t46a = (in29 *  2675 + 2048) >> 12;
+    int t47a = (in3  *  -301 + 2048) >> 12;
+    int t48a = (in3  *  4085 + 2048) >> 12;
+    int t49a = (in29 *  3102 + 2048) >> 12;
+    int t50a = (in19 *  3659 + 2048) >> 12;
+    int t51a = (in13 *  3889 + 2048) >> 12;
+    int t52a = (in11 *  3948 + 2048) >> 12;
+    int t53a = (in21 *  3564 + 2048) >> 12;
+    int t54a = (in27 *  3229 + 2048) >> 12;
+    int t55a = (in5  *  4065 + 2048) >> 12;
+    int t56a = (in7  *  4036 + 2048) >> 12;
+    int t57a = (in25 *  3349 + 2048) >> 12;
+    int t58a = (in23 *  3461 + 2048) >> 12;
+    int t59a = (in9  *  3996 + 2048) >> 12;
+    int t60a = (in15 *  3822 + 2048) >> 12;
+    int t61a = (in17 *  3745 + 2048) >> 12;
+    int t62a = (in31 *  2967 + 2048) >> 12;
+    int t63a = (in1  *  4095 + 2048) >> 12;
+
+    int t32 = CLIP(t32a + t33a);
+    int t33 = CLIP(t32a - t33a);
+    int t34 = CLIP(t35a - t34a);
+    int t35 = CLIP(t35a + t34a);
+    int t36 = CLIP(t36a + t37a);
+    int t37 = CLIP(t36a - t37a);
+    int t38 = CLIP(t39a - t38a);
+    int t39 = CLIP(t39a + t38a);
+    int t40 = CLIP(t40a + t41a);
+    int t41 = CLIP(t40a - t41a);
+    int t42 = CLIP(t43a - t42a);
+    int t43 = CLIP(t43a + t42a);
+    int t44 = CLIP(t44a + t45a);
+    int t45 = CLIP(t44a - t45a);
+    int t46 = CLIP(t47a - t46a);
+    int t47 = CLIP(t47a + t46a);
+    int t48 = CLIP(t48a + t49a);
+    int t49 = CLIP(t48a - t49a);
+    int t50 = CLIP(t51a - t50a);
+    int t51 = CLIP(t51a + t50a);
+    int t52 = CLIP(t52a + t53a);
+    int t53 = CLIP(t52a - t53a);
+    int t54 = CLIP(t55a - t54a);
+    int t55 = CLIP(t55a + t54a);
+    int t56 = CLIP(t56a + t57a);
+    int t57 = CLIP(t56a - t57a);
+    int t58 = CLIP(t59a - t58a);
+    int t59 = CLIP(t59a + t58a);
+    int t60 = CLIP(t60a + t61a);
+    int t61 = CLIP(t60a - t61a);
+    int t62 = CLIP(t63a - t62a);
+    int t63 = CLIP(t63a + t62a);
+
+    t33a = ((t33 * (4096 - 4076) + t62 *   401         + 2048) >> 12) - t33;
+    t34a = ((t34 *  -401         + t61 * (4096 - 4076) + 2048) >> 12) - t61;
+    t37a =  (t37 * -1299         + t58 *  1583         + 1024) >> 11;
+    t38a =  (t38 * -1583         + t57 * -1299         + 1024) >> 11;
+    t41a = ((t41 * (4096 - 3612) + t54 *  1931         + 2048) >> 12) - t41;
+    t42a = ((t42 * -1931         + t53 * (4096 - 3612) + 2048) >> 12) - t53;
+    t45a = ((t45 * -1189         + t50 * (3920 - 4096) + 2048) >> 12) + t50;
+    t46a = ((t46 * (4096 - 3920) + t49 * -1189         + 2048) >> 12) - t46;
+    t49a = ((t46 * -1189         + t49 * (3920 - 4096) + 2048) >> 12) + t49;
+    t50a = ((t45 * (3920 - 4096) + t50 *  1189         + 2048) >> 12) + t45;
+    t53a = ((t42 * (4096 - 3612) + t53 *  1931         + 2048) >> 12) - t42;
+    t54a = ((t41 *  1931         + t54 * (3612 - 4096) + 2048) >> 12) + t54;
+    t57a =  (t38 * -1299         + t57 *  1583         + 1024) >> 11;
+    t58a =  (t37 *  1583         + t58 *  1299         + 1024) >> 11;
+    t61a = ((t34 * (4096 - 4076) + t61 *   401         + 2048) >> 12) - t34;
+    t62a = ((t33 *   401         + t62 * (4076 - 4096) + 2048) >> 12) + t62;
+
+    t32a = CLIP(t32  + t35);
+    t33  = CLIP(t33a + t34a);
+    t34  = CLIP(t33a - t34a);
+    t35a = CLIP(t32  - t35);
+    t36a = CLIP(t39  - t36);
+    t37  = CLIP(t38a - t37a);
+    t38  = CLIP(t38a + t37a);
+    t39a = CLIP(t39  + t36);
+    t40a = CLIP(t40  + t43);
+    t41  = CLIP(t41a + t42a);
+    t42  = CLIP(t41a - t42a);
+    t43a = CLIP(t40  - t43);
+    t44a = CLIP(t47  - t44);
+    t45  = CLIP(t46a - t45a);
+    t46  = CLIP(t46a + t45a);
+    t47a = CLIP(t47  + t44);
+    t48a = CLIP(t48  + t51);
+    t49  = CLIP(t49a + t50a);
+    t50  = CLIP(t49a - t50a);
+    t51a = CLIP(t48  - t51);
+    t52a = CLIP(t55  - t52);
+    t53  = CLIP(t54a - t53a);
+    t54  = CLIP(t54a + t53a);
+    t55a = CLIP(t55  + t52);
+    t56a = CLIP(t56  + t59);
+    t57  = CLIP(t57a + t58a);
+    t58  = CLIP(t57a - t58a);
+    t59a = CLIP(t56  - t59);
+    t60a = CLIP(t63  - t60);
+    t61  = CLIP(t62a - t61a);
+    t62  = CLIP(t62a + t61a);
+    t63a = CLIP(t63  + t60);
+
+    t34a = ((t34  * (4096 - 4017) + t61  *   799         + 2048) >> 12) - t34;
+    t35  = ((t35a * (4096 - 4017) + t60a *   799         + 2048) >> 12) - t35a;
+    t36  = ((t36a *  -799         + t59a * (4096 - 4017) + 2048) >> 12) - t59a;
+    t37a = ((t37  *  -799         + t58  * (4096 - 4017) + 2048) >> 12) - t58;
+    t42a =  (t42  * -1138         + t53  *  1703         + 1024) >> 11;
+    t43  =  (t43a * -1138         + t52a *  1703         + 1024) >> 11;
+    t44  =  (t44a * -1703         + t51a * -1138         + 1024) >> 11;
+    t45a =  (t45  * -1703         + t50  * -1138         + 1024) >> 11;
+    t50a =  (t45  * -1138         + t50  *  1703         + 1024) >> 11;
+    t51  =  (t44a * -1138         + t51a *  1703         + 1024) >> 11;
+    t52  =  (t43a *  1703         + t52a *  1138         + 1024) >> 11;
+    t53a =  (t42  *  1703         + t53  *  1138         + 1024) >> 11;
+    t58a = ((t37  * (4096 - 4017) + t58  *   799         + 2048) >> 12) - t37;
+    t59  = ((t36a * (4096 - 4017) + t59a *   799         + 2048) >> 12) - t36a;
+    t60  = ((t35a *   799         + t60a * (4017 - 4096) + 2048) >> 12) + t60a;
+    t61a = ((t34  *   799         + t61  * (4017 - 4096) + 2048) >> 12) + t61;
+
+    t32  = CLIP(t32a + t39a);
+    t33a = CLIP(t33  + t38);
+    t34  = CLIP(t34a + t37a);
+    t35a = CLIP(t35  + t36);
+    t36a = CLIP(t35  - t36);
+    t37  = CLIP(t34a - t37a);
+    t38a = CLIP(t33  - t38);
+    t39  = CLIP(t32a - t39a);
+    t40  = CLIP(t47a - t40a);
+    t41a = CLIP(t46  - t41);
+    t42  = CLIP(t45a - t42a);
+    t43a = CLIP(t44  - t43);
+    t44a = CLIP(t44  + t43);
+    t45  = CLIP(t45a + t42a);
+    t46a = CLIP(t46  + t41);
+    t47  = CLIP(t47a + t40a);
+    t48  = CLIP(t48a + t55a);
+    t49a = CLIP(t49  + t54);
+    t50  = CLIP(t50a + t53a);
+    t51a = CLIP(t51  + t52);
+    t52a = CLIP(t51  - t52);
+    t53  = CLIP(t50a - t53a);
+    t54a = CLIP(t49  - t54);
+    t55  = CLIP(t48a - t55a);
+    t56  = CLIP(t63a - t56a);
+    t57a = CLIP(t62  - t57);
+    t58  = CLIP(t61a - t58a);
+    t59a = CLIP(t60  - t59);
+    t60a = CLIP(t60  + t59);
+    t61  = CLIP(t61a + t58a);
+    t62a = CLIP(t62  + t57);
+    t63  = CLIP(t63a + t56a);
+
+    t36  = ((t36a * (4096 - 3784) + t59a *  1567         + 2048) >> 12) - t36a;
+    t37a = ((t37  * (4096 - 3784) + t58  *  1567         + 2048) >> 12) - t37;
+    t38  = ((t38a * (4096 - 3784) + t57a *  1567         + 2048) >> 12) - t38a;
+    t39a = ((t39  * (4096 - 3784) + t56  *  1567         + 2048) >> 12) - t39;
+    t40a = ((t40  * -1567         + t55  * (4096 - 3784) + 2048) >> 12) - t55;
+    t41  = ((t41a * -1567         + t54a * (4096 - 3784) + 2048) >> 12) - t54a;
+    t42a = ((t42  * -1567         + t53  * (4096 - 3784) + 2048) >> 12) - t53;
+    t43  = ((t43a * -1567         + t52a * (4096 - 3784) + 2048) >> 12) - t52a;
+    t52  = ((t43a * (4096 - 3784) + t52a *  1567         + 2048) >> 12) - t43a;
+    t53a = ((t42  * (4096 - 3784) + t53  *  1567         + 2048) >> 12) - t42;
+    t54  = ((t41a * (4096 - 3784) + t54a *  1567         + 2048) >> 12) - t41a;
+    t55a = ((t40  * (4096 - 3784) + t55  *  1567         + 2048) >> 12) - t40;
+    t56a = ((t39  *  1567         + t56  * (3784 - 4096) + 2048) >> 12) + t56;
+    t57  = ((t38a *  1567         + t57a * (3784 - 4096) + 2048) >> 12) + t57a;
+    t58a = ((t37  *  1567         + t58  * (3784 - 4096) + 2048) >> 12) + t58;
+    t59  = ((t36a *  1567         + t59a * (3784 - 4096) + 2048) >> 12) + t59a;
+
+    t32a = CLIP(t32  + t47);
+    t33  = CLIP(t33a + t46a);
+    t34a = CLIP(t34  + t45);
+    t35  = CLIP(t35a + t44a);
+    t36a = CLIP(t36  + t43);
+    t37  = CLIP(t37a + t42a);
+    t38a = CLIP(t38  + t41);
+    t39  = CLIP(t39a + t40a);
+    t40  = CLIP(t39a - t40a);
+    t41a = CLIP(t38  - t41);
+    t42  = CLIP(t37a - t42a);
+    t43a = CLIP(t36  - t43);
+    t44  = CLIP(t35a - t44a);
+    t45a = CLIP(t34  - t45);
+    t46  = CLIP(t33a - t46a);
+    t47a = CLIP(t32  - t47);
+    t48a = CLIP(t63  - t48);
+    t49  = CLIP(t62a - t49a);
+    t50a = CLIP(t61  - t50);
+    t51  = CLIP(t60a - t51a);
+    t52a = CLIP(t59  - t52);
+    t53  = CLIP(t58a - t53a);
+    t54a = CLIP(t57  - t54);
+    t55  = CLIP(t56a - t55a);
+    t56  = CLIP(t56a + t55a);
+    t57a = CLIP(t57  + t54);
+    t58  = CLIP(t58a + t53a);
+    t59a = CLIP(t59  + t52);
+    t60  = CLIP(t60a + t51a);
+    t61a = CLIP(t61  + t50);
+    t62  = CLIP(t62a + t49a);
+    t63a = CLIP(t63  + t48);
+
+    t40a = ((t55  - t40 ) * 181 + 128) >> 8;
+    t41  = ((t54a - t41a) * 181 + 128) >> 8;
+    t42a = ((t53  - t42 ) * 181 + 128) >> 8;
+    t43  = ((t52a - t43a) * 181 + 128) >> 8;
+    t44a = ((t51  - t44 ) * 181 + 128) >> 8;
+    t45  = ((t50a - t45a) * 181 + 128) >> 8;
+    t46a = ((t49  - t46 ) * 181 + 128) >> 8;
+    t47  = ((t48a - t47a) * 181 + 128) >> 8;
+    t48  = ((t47a + t48a) * 181 + 128) >> 8;
+    t49a = ((t46  + t49 ) * 181 + 128) >> 8;
+    t50  = ((t45a + t50a) * 181 + 128) >> 8;
+    t51a = ((t44  + t51 ) * 181 + 128) >> 8;
+    t52  = ((t43a + t52a) * 181 + 128) >> 8;
+    t53a = ((t42  + t53 ) * 181 + 128) >> 8;
+    t54  = ((t41a + t54a) * 181 + 128) >> 8;
+    t55a = ((t40  + t55 ) * 181 + 128) >> 8;
+
+    const int t0  = c[ 0 * stride];
+    const int t1  = c[ 2 * stride];
+    const int t2  = c[ 4 * stride];
+    const int t3  = c[ 6 * stride];
+    const int t4  = c[ 8 * stride];
+    const int t5  = c[10 * stride];
+    const int t6  = c[12 * stride];
+    const int t7  = c[14 * stride];
+    const int t8  = c[16 * stride];
+    const int t9  = c[18 * stride];
+    const int t10 = c[20 * stride];
+    const int t11 = c[22 * stride];
+    const int t12 = c[24 * stride];
+    const int t13 = c[26 * stride];
+    const int t14 = c[28 * stride];
+    const int t15 = c[30 * stride];
+    const int t16 = c[32 * stride];
+    const int t17 = c[34 * stride];
+    const int t18 = c[36 * stride];
+    const int t19 = c[38 * stride];
+    const int t20 = c[40 * stride];
+    const int t21 = c[42 * stride];
+    const int t22 = c[44 * stride];
+    const int t23 = c[46 * stride];
+    const int t24 = c[48 * stride];
+    const int t25 = c[50 * stride];
+    const int t26 = c[52 * stride];
+    const int t27 = c[54 * stride];
+    const int t28 = c[56 * stride];
+    const int t29 = c[58 * stride];
+    const int t30 = c[60 * stride];
+    const int t31 = c[62 * stride];
+
+    c[ 0 * stride] = CLIP(t0  + t63a);
+    c[ 1 * stride] = CLIP(t1  + t62);
+    c[ 2 * stride] = CLIP(t2  + t61a);
+    c[ 3 * stride] = CLIP(t3  + t60);
+    c[ 4 * stride] = CLIP(t4  + t59a);
+    c[ 5 * stride] = CLIP(t5  + t58);
+    c[ 6 * stride] = CLIP(t6  + t57a);
+    c[ 7 * stride] = CLIP(t7  + t56);
+    c[ 8 * stride] = CLIP(t8  + t55a);
+    c[ 9 * stride] = CLIP(t9  + t54);
+    c[10 * stride] = CLIP(t10 + t53a);
+    c[11 * stride] = CLIP(t11 + t52);
+    c[12 * stride] = CLIP(t12 + t51a);
+    c[13 * stride] = CLIP(t13 + t50);
+    c[14 * stride] = CLIP(t14 + t49a);
+    c[15 * stride] = CLIP(t15 + t48);
+    c[16 * stride] = CLIP(t16 + t47);
+    c[17 * stride] = CLIP(t17 + t46a);
+    c[18 * stride] = CLIP(t18 + t45);
+    c[19 * stride] = CLIP(t19 + t44a);
+    c[20 * stride] = CLIP(t20 + t43);
+    c[21 * stride] = CLIP(t21 + t42a);
+    c[22 * stride] = CLIP(t22 + t41);
+    c[23 * stride] = CLIP(t23 + t40a);
+    c[24 * stride] = CLIP(t24 + t39);
+    c[25 * stride] = CLIP(t25 + t38a);
+    c[26 * stride] = CLIP(t26 + t37);
+    c[27 * stride] = CLIP(t27 + t36a);
+    c[28 * stride] = CLIP(t28 + t35);
+    c[29 * stride] = CLIP(t29 + t34a);
+    c[30 * stride] = CLIP(t30 + t33);
+    c[31 * stride] = CLIP(t31 + t32a);
+    c[32 * stride] = CLIP(t31 - t32a);
+    c[33 * stride] = CLIP(t30 - t33);
+    c[34 * stride] = CLIP(t29 - t34a);
+    c[35 * stride] = CLIP(t28 - t35);
+    c[36 * stride] = CLIP(t27 - t36a);
+    c[37 * stride] = CLIP(t26 - t37);
+    c[38 * stride] = CLIP(t25 - t38a);
+    c[39 * stride] = CLIP(t24 - t39);
+    c[40 * stride] = CLIP(t23 - t40a);
+    c[41 * stride] = CLIP(t22 - t41);
+    c[42 * stride] = CLIP(t21 - t42a);
+    c[43 * stride] = CLIP(t20 - t43);
+    c[44 * stride] = CLIP(t19 - t44a);
+    c[45 * stride] = CLIP(t18 - t45);
+    c[46 * stride] = CLIP(t17 - t46a);
+    c[47 * stride] = CLIP(t16 - t47);
+    c[48 * stride] = CLIP(t15 - t48);
+    c[49 * stride] = CLIP(t14 - t49a);
+    c[50 * stride] = CLIP(t13 - t50);
+    c[51 * stride] = CLIP(t12 - t51a);
+    c[52 * stride] = CLIP(t11 - t52);
+    c[53 * stride] = CLIP(t10 - t53a);
+    c[54 * stride] = CLIP(t9  - t54);
+    c[55 * stride] = CLIP(t8  - t55a);
+    c[56 * stride] = CLIP(t7  - t56);
+    c[57 * stride] = CLIP(t6  - t57a);
+    c[58 * stride] = CLIP(t5  - t58);
+    c[59 * stride] = CLIP(t4  - t59a);
+    c[60 * stride] = CLIP(t3  - t60);
+    c[61 * stride] = CLIP(t2  - t61a);
+    c[62 * stride] = CLIP(t1  - t62);
+    c[63 * stride] = CLIP(t0  - t63a);
+}
+
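+/* The ADST kernels read from `in` and write to `out` so that flipadst can
+ * reuse them with a negated output stride (see the inv_adst_1d macro
+ * below). */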
+static NOINLINE void
+inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+                        const int min, const int max,
+                        int32_t *const out, const ptrdiff_t out_s)
+{
+    assert(in_s > 0 && out_s != 0);
+    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+
+    out[0 * out_s] = (( 1321         * in0 + (3803 - 4096) * in2 +
+                       (2482 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+                     in2 + in3 + in1;
+    out[1 * out_s] = (((2482 - 4096) * in0 -  1321         * in2 -
+                       (3803 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+                     in0 - in3 + in1;
+    out[2 * out_s] = (209 * (in0 - in2 + in3) + 128) >> 8;
+    out[3 * out_s] = (((3803 - 4096) * in0 + (2482 - 4096) * in2 -
+                        1321         * in3 - (3344 - 4096) * in1 + 2048) >> 12) +
+                     in0 + in2 - in1;
+}
+
+static NOINLINE void
+inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+                        const int min, const int max,
+                        int32_t *const out, const ptrdiff_t out_s)
+{
+    assert(in_s > 0 && out_s != 0);
+    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+    const int in4 = in[4 * in_s], in5 = in[5 * in_s];
+    const int in6 = in[6 * in_s], in7 = in[7 * in_s];
+
+    const int t0a = (((4076 - 4096) * in7 +   401         * in0 + 2048) >> 12) + in7;
+    const int t1a = ((  401         * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
+    const int t2a = (((3612 - 4096) * in5 +  1931         * in2 + 2048) >> 12) + in5;
+    const int t3a = (( 1931         * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
+          int t4a =  ( 1299         * in3 +  1583         * in4 + 1024) >> 11;
+          int t5a =  ( 1583         * in3 -  1299         * in4 + 1024) >> 11;
+          int t6a = (( 1189         * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
+          int t7a = (((3920 - 4096) * in1 -  1189         * in6 + 2048) >> 12) + in1;
+
+    const int t0 = CLIP(t0a + t4a);
+    const int t1 = CLIP(t1a + t5a);
+          int t2 = CLIP(t2a + t6a);
+          int t3 = CLIP(t3a + t7a);
+    const int t4 = CLIP(t0a - t4a);
+    const int t5 = CLIP(t1a - t5a);
+          int t6 = CLIP(t2a - t6a);
+          int t7 = CLIP(t3a - t7a);
+
+    t4a = (((3784 - 4096) * t4 +  1567         * t5 + 2048) >> 12) + t4;
+    t5a = (( 1567         * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5;
+    t6a = (((3784 - 4096) * t7 -  1567         * t6 + 2048) >> 12) + t7;
+    t7a = (( 1567         * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;
+
+    out[0 * out_s] =  CLIP(t0  + t2 );
+    out[7 * out_s] = -CLIP(t1  + t3 );
+    t2             =  CLIP(t0  - t2 );
+    t3             =  CLIP(t1  - t3 );
+    out[1 * out_s] = -CLIP(t4a + t6a);
+    out[6 * out_s] =  CLIP(t5a + t7a);
+    t6             =  CLIP(t4a - t6a);
+    t7             =  CLIP(t5a - t7a);
+
+    out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
+    out[4 * out_s] =   ((t2 - t3) * 181 + 128) >> 8;
+    out[2 * out_s] =   ((t6 + t7) * 181 + 128) >> 8;
+    out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
+}
+
+static NOINLINE void
+inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+                         const int min, const int max,
+                         int32_t *const out, const ptrdiff_t out_s)
+{
+    assert(in_s > 0 && out_s != 0);
+    const int in0  = in[ 0 * in_s], in1  = in[ 1 * in_s];
+    const int in2  = in[ 2 * in_s], in3  = in[ 3 * in_s];
+    const int in4  = in[ 4 * in_s], in5  = in[ 5 * in_s];
+    const int in6  = in[ 6 * in_s], in7  = in[ 7 * in_s];
+    const int in8  = in[ 8 * in_s], in9  = in[ 9 * in_s];
+    const int in10 = in[10 * in_s], in11 = in[11 * in_s];
+    const int in12 = in[12 * in_s], in13 = in[13 * in_s];
+    const int in14 = in[14 * in_s], in15 = in[15 * in_s];
+
+    int t0  = ((in15 * (4091 - 4096) + in0  *   201         + 2048) >> 12) + in15;
+    int t1  = ((in15 *   201         - in0  * (4091 - 4096) + 2048) >> 12) - in0;
+    int t2  = ((in13 * (3973 - 4096) + in2  *   995         + 2048) >> 12) + in13;
+    int t3  = ((in13 *   995         - in2  * (3973 - 4096) + 2048) >> 12) - in2;
+    int t4  = ((in11 * (3703 - 4096) + in4  *  1751         + 2048) >> 12) + in11;
+    int t5  = ((in11 *  1751         - in4  * (3703 - 4096) + 2048) >> 12) - in4;
+    int t6  =  (in9  *  1645         + in6  *  1220         + 1024) >> 11;
+    int t7  =  (in9  *  1220         - in6  *  1645         + 1024) >> 11;
+    int t8  = ((in7  *  2751         + in8  * (3035 - 4096) + 2048) >> 12) + in8;
+    int t9  = ((in7  * (3035 - 4096) - in8  *  2751         + 2048) >> 12) + in7;
+    int t10 = ((in5  *  2106         + in10 * (3513 - 4096) + 2048) >> 12) + in10;
+    int t11 = ((in5  * (3513 - 4096) - in10 *  2106         + 2048) >> 12) + in5;
+    int t12 = ((in3  *  1380         + in12 * (3857 - 4096) + 2048) >> 12) + in12;
+    int t13 = ((in3  * (3857 - 4096) - in12 *  1380         + 2048) >> 12) + in3;
+    int t14 = ((in1  *   601         + in14 * (4052 - 4096) + 2048) >> 12) + in14;
+    int t15 = ((in1  * (4052 - 4096) - in14 *   601         + 2048) >> 12) + in1;
+
+    int t0a  = CLIP(t0 + t8 );
+    int t1a  = CLIP(t1 + t9 );
+    int t2a  = CLIP(t2 + t10);
+    int t3a  = CLIP(t3 + t11);
+    int t4a  = CLIP(t4 + t12);
+    int t5a  = CLIP(t5 + t13);
+    int t6a  = CLIP(t6 + t14);
+    int t7a  = CLIP(t7 + t15);
+    int t8a  = CLIP(t0 - t8 );
+    int t9a  = CLIP(t1 - t9 );
+    int t10a = CLIP(t2 - t10);
+    int t11a = CLIP(t3 - t11);
+    int t12a = CLIP(t4 - t12);
+    int t13a = CLIP(t5 - t13);
+    int t14a = CLIP(t6 - t14);
+    int t15a = CLIP(t7 - t15);
+
+    t8   = ((t8a  * (4017 - 4096) + t9a  *   799         + 2048) >> 12) + t8a;
+    t9   = ((t8a  *   799         - t9a  * (4017 - 4096) + 2048) >> 12) - t9a;
+    t10  = ((t10a *  2276         + t11a * (3406 - 4096) + 2048) >> 12) + t11a;
+    t11  = ((t10a * (3406 - 4096) - t11a *  2276         + 2048) >> 12) + t10a;
+    t12  = ((t13a * (4017 - 4096) - t12a *   799         + 2048) >> 12) + t13a;
+    t13  = ((t13a *   799         + t12a * (4017 - 4096) + 2048) >> 12) + t12a;
+    t14  = ((t15a *  2276         - t14a * (3406 - 4096) + 2048) >> 12) - t14a;
+    t15  = ((t15a * (3406 - 4096) + t14a *  2276         + 2048) >> 12) + t15a;
+
+    t0   = CLIP(t0a + t4a);
+    t1   = CLIP(t1a + t5a);
+    t2   = CLIP(t2a + t6a);
+    t3   = CLIP(t3a + t7a);
+    t4   = CLIP(t0a - t4a);
+    t5   = CLIP(t1a - t5a);
+    t6   = CLIP(t2a - t6a);
+    t7   = CLIP(t3a - t7a);
+    t8a  = CLIP(t8  + t12);
+    t9a  = CLIP(t9  + t13);
+    t10a = CLIP(t10 + t14);
+    t11a = CLIP(t11 + t15);
+    t12a = CLIP(t8  - t12);
+    t13a = CLIP(t9  - t13);
+    t14a = CLIP(t10 - t14);
+    t15a = CLIP(t11 - t15);
+
+    t4a  = ((t4   * (3784 - 4096) + t5   *  1567         + 2048) >> 12) + t4;
+    t5a  = ((t4   *  1567         - t5   * (3784 - 4096) + 2048) >> 12) - t5;
+    t6a  = ((t7   * (3784 - 4096) - t6   *  1567         + 2048) >> 12) + t7;
+    t7a  = ((t7   *  1567         + t6   * (3784 - 4096) + 2048) >> 12) + t6;
+    t12  = ((t12a * (3784 - 4096) + t13a *  1567         + 2048) >> 12) + t12a;
+    t13  = ((t12a *  1567         - t13a * (3784 - 4096) + 2048) >> 12) - t13a;
+    t14  = ((t15a * (3784 - 4096) - t14a *  1567         + 2048) >> 12) + t15a;
+    t15  = ((t15a *  1567         + t14a * (3784 - 4096) + 2048) >> 12) + t14a;
+
+    out[ 0 * out_s] =  CLIP(t0  + t2  );
+    out[15 * out_s] = -CLIP(t1  + t3  );
+    t2a             =  CLIP(t0  - t2  );
+    t3a             =  CLIP(t1  - t3  );
+    out[ 3 * out_s] = -CLIP(t4a + t6a );
+    out[12 * out_s] =  CLIP(t5a + t7a );
+    t6              =  CLIP(t4a - t6a );
+    t7              =  CLIP(t5a - t7a );
+    out[ 1 * out_s] = -CLIP(t8a + t10a);
+    out[14 * out_s] =  CLIP(t9a + t11a);
+    t10             =  CLIP(t8a - t10a);
+    t11             =  CLIP(t9a - t11a);
+    out[ 2 * out_s] =  CLIP(t12 + t14 );
+    out[13 * out_s] = -CLIP(t13 + t15 );
+    t14a            =  CLIP(t12 - t14 );
+    t15a            =  CLIP(t13 - t15 );
+
+    out[ 7 * out_s] = -(((t2a  + t3a)  * 181 + 128) >> 8);
+    out[ 8 * out_s] =   ((t2a  - t3a)  * 181 + 128) >> 8;
+    out[ 4 * out_s] =   ((t6   + t7)   * 181 + 128) >> 8;
+    out[11 * out_s] = -(((t6   - t7)   * 181 + 128) >> 8);
+    out[ 6 * out_s] =   ((t10  + t11)  * 181 + 128) >> 8;
+    out[ 9 * out_s] = -(((t10  - t11)  * 181 + 128) >> 8);
+    out[ 5 * out_s] = -(((t14a + t15a) * 181 + 128) >> 8);
+    out[10 * out_s] =   ((t14a - t15a) * 181 + 128) >> 8;
+}
+
+#define inv_adst_1d(sz) \
+void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+                               const int min, const int max) \
+{ \
+    inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
+} \
+void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+                                   const int min, const int max) \
+{ \
+    inv_adst##sz##_1d_internal_c(c, stride, min, max, \
+                                 &c[(sz - 1) * stride], -stride); \
+}
+
+inv_adst_1d( 4)
+inv_adst_1d( 8)
+inv_adst_1d(16)
+
+#undef inv_adst_1d
+
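+/* The identity "transforms" only rescale: the N-point variant scales by
+ * sqrt(N / 2), with 1697 / 4096 ~= sqrt(2) - 1 as the fixed-point
+ * approximation for the irrational factors. */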
+void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
+                              const int min, const int max)
+{
+    assert(stride > 0);
+    for (int i = 0; i < 4; i++) {
+        const int in = c[stride * i];
+        c[stride * i] = in + ((in * 1697 + 2048) >> 12);
+    }
+}
+
+void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
+                              const int min, const int max)
+{
+    assert(stride > 0);
+    for (int i = 0; i < 8; i++)
+        c[stride * i] *= 2;
+}
+
+void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
+                               const int min, const int max)
+{
+    assert(stride > 0);
+    for (int i = 0; i < 16; i++) {
+        const int in = c[stride * i];
+        c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
+    }
+}
+
+void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
+                               const int min, const int max)
+{
+    assert(stride > 0);
+    for (int i = 0; i < 32; i++)
+        c[stride * i] *= 4;
+}
+
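+/* 4-point Walsh-Hadamard transform for lossless blocks, built from integer
+ * lifting steps so it inverts the forward transform exactly. */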
+void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
+    assert(stride > 0);
+    const int in0 = c[0 * stride], in1 = c[1 * stride];
+    const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+    const int t0 = in0 + in1;
+    const int t2 = in2 - in3;
+    const int t4 = (t0 - t2) >> 1;
+    const int t3 = t4 - in3;
+    const int t1 = t4 - in1;
+
+    c[0 * stride] = t0 - t3;
+    c[1 * stride] = t3;
+    c[2 * stride] = t1;
+    c[3 * stride] = t2 + t1;
+}
diff --git a/src/itx_1d.h b/src/itx_1d.h
new file mode 100644 (file)
index 0000000..b63d71b
--- /dev/null
+++ b/src/itx_1d.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef DAV1D_SRC_ITX_1D_H
+#define DAV1D_SRC_ITX_1D_H
+
+#define decl_itx_1d_fn(name) \
+void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
+typedef decl_itx_1d_fn(*itx_1d_fn);
+
+decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
+
+void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
+
+#endif /* DAV1D_SRC_ITX_1D_H */
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
new file mode 100644 (file)
index 0000000..a0e807f
--- /dev/null
+++ b/src/itx_tmpl.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+#include "src/itx_1d.h"
+
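+/* Generic 2D inverse transform. A zero eob with has_dconly set takes a
+ * DC-only fast path; otherwise the rows of the (at most 32x32) coded
+ * coefficient region go through the first 1D transform at row precision,
+ * the intermediates are rounded, shifted and clipped to column precision,
+ * the columns go through the second 1D transform, and the result is
+ * rounded by (x + 8) >> 4 before being added to dst. 2:1 rectangular
+ * blocks are pre-scaled by 181/256 ~= 1/sqrt(2). */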
+static NOINLINE void
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
+               const int eob, const int w, const int h, const int shift,
+               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
+               const int has_dconly HIGHBD_DECL_SUFFIX)
+{
+    assert(w >= 4 && w <= 64);
+    assert(h >= 4 && h <= 64);
+    assert(eob >= 0);
+
+    const int is_rect2 = w * 2 == h || h * 2 == w;
+    const int rnd = (1 << shift) >> 1;
+
+    if (eob < has_dconly) {
+        int dc = coeff[0];
+        coeff[0] = 0;
+        if (is_rect2)
+            dc = (dc * 181 + 128) >> 8;
+        dc = (dc * 181 + 128) >> 8;
+        dc = (dc + rnd) >> shift;
+        dc = (dc * 181 + 128 + 2048) >> 12;
+        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+            for (int x = 0; x < w; x++)
+                dst[x] = iclip_pixel(dst[x] + dc);
+        return;
+    }
+
+    const int sh = imin(h, 32), sw = imin(w, 32);
+#if BITDEPTH == 8
+    const int row_clip_min = INT16_MIN;
+    const int col_clip_min = INT16_MIN;
+#else
+    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
+    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
+#endif
+    const int row_clip_max = ~row_clip_min;
+    const int col_clip_max = ~col_clip_min;
+
+    int32_t tmp[64 * 64], *c = tmp;
+    for (int y = 0; y < sh; y++, c += w) {
+        if (is_rect2)
+            for (int x = 0; x < sw; x++)
+                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
+        else
+            for (int x = 0; x < sw; x++)
+                c[x] = coeff[y + x * sh];
+        first_1d_fn(c, 1, row_clip_min, row_clip_max);
+    }
+
+    memset(coeff, 0, sizeof(*coeff) * sw * sh);
+    for (int i = 0; i < w * sh; i++)
+        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
+
+    for (int x = 0; x < w; x++)
+        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);
+
+    c = tmp;
+    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+                                               const ptrdiff_t stride, \
+                                               coef *const coeff, \
+                                               const int eob \
+                                               HIGHBD_DECL_SUFFIX) \
+{ \
+    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
+                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
+                   has_dconly HIGHBD_TAIL_SUFFIX); \
+}
+
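+/* Which transform combinations exist depends on the block size: any size
+ * with a 64-point dimension supports DCT_DCT only; a 32-point dimension
+ * adds IDTX; the remaining sizes get the full DCT/ADST/flipadst set, with
+ * identity paired against (flip)adst everywhere except 16x16. */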
+#define inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(dct, dct, w, h, shift, 1)
+
+#define inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(identity, identity, w, h, shift, 0)
+
+#define inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn(adst,     dct,      w, h, shift, 0) \
+inv_txfm_fn(dct,      adst,     w, h, shift, 0) \
+inv_txfm_fn(adst,     adst,     w, h, shift, 0) \
+inv_txfm_fn(dct,      flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, dct,      w, h, shift, 0) \
+inv_txfm_fn(adst,     flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, adst,     w, h, shift, 0) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(identity, dct,      w, h, shift, 0) \
+inv_txfm_fn(dct,      identity, w, h, shift, 0) \
+
+#define inv_txfm_fn84(w, h, shift) \
+inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
+inv_txfm_fn(identity, adst,     w, h, shift, 0) \
+inv_txfm_fn(adst,     identity, w, h, shift, 0) \
+
+inv_txfm_fn84( 4,  4, 0)
+inv_txfm_fn84( 4,  8, 0)
+inv_txfm_fn84( 4, 16, 1)
+inv_txfm_fn84( 8,  4, 0)
+inv_txfm_fn84( 8,  8, 1)
+inv_txfm_fn84( 8, 16, 1)
+inv_txfm_fn32( 8, 32, 2)
+inv_txfm_fn84(16,  4, 1)
+inv_txfm_fn84(16,  8, 1)
+inv_txfm_fn16(16, 16, 2)
+inv_txfm_fn32(16, 32, 1)
+inv_txfm_fn64(16, 64, 2)
+inv_txfm_fn32(32,  8, 2)
+inv_txfm_fn32(32, 16, 1)
+inv_txfm_fn32(32, 32, 2)
+inv_txfm_fn64(32, 64, 1)
+inv_txfm_fn64(64, 16, 2)
+inv_txfm_fn64(64, 32, 1)
+inv_txfm_fn64(64, 64, 2)
+
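+/* Lossless 4x4 WHT: the coefficients are pre-shifted down by 2 to undo the
+ * encoder-side scaling, and no intermediate rounding is required. */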
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+                                       coef *const coeff, const int eob
+                                       HIGHBD_DECL_SUFFIX)
+{
+    int32_t tmp[4 * 4], *c = tmp;
+    for (int y = 0; y < 4; y++, c += 4) {
+        for (int x = 0; x < 4; x++)
+            c[x] = coeff[y + x * 4] >> 2;
+        dav1d_inv_wht4_1d_c(c, 1);
+    }
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+
+    for (int x = 0; x < 4; x++)
+        dav1d_inv_wht4_1d_c(&tmp[x], 4);
+
+    c = tmp;
+    for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
+        for (int x = 0; x < 4; x++)
+            dst[x] = iclip_pixel(dst[x] + *c++);
+}
+
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_all_fn64(w, h, pfx) \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
+        inv_txfm_add_dct_dct_##w##x##h##_c
+
+#define assign_itx_all_fn32(w, h, pfx) \
+    assign_itx_all_fn64(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+        inv_txfm_add_identity_identity_##w##x##h##_c
+
+#define assign_itx_all_fn16(w, h, pfx) \
+    assign_itx_all_fn32(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+        inv_txfm_add_adst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+        inv_txfm_add_dct_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+        inv_txfm_add_adst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+        inv_txfm_add_dct_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+        inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+    assign_itx_all_fn16(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+        inv_txfm_add_adst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+        inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+    assign_itx_all_fn84( 4,  4, );
+    assign_itx_all_fn84( 4,  8, R);
+    assign_itx_all_fn84( 4, 16, R);
+    assign_itx_all_fn84( 8,  4, R);
+    assign_itx_all_fn84( 8,  8, );
+    assign_itx_all_fn84( 8, 16, R);
+    assign_itx_all_fn32( 8, 32, R);
+    assign_itx_all_fn84(16,  4, R);
+    assign_itx_all_fn84(16,  8, R);
+    assign_itx_all_fn16(16, 16, );
+    assign_itx_all_fn32(16, 32, R);
+    assign_itx_all_fn64(16, 64, R);
+    assign_itx_all_fn32(32,  8, R);
+    assign_itx_all_fn32(32, 16, R);
+    assign_itx_all_fn32(32, 32, );
+    assign_itx_all_fn64(32, 64, R);
+    assign_itx_all_fn64(64, 16, R);
+    assign_itx_all_fn64(64, 32, R);
+    assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_itx_dsp_init_arm)(c, bpc);
+#endif
+#if ARCH_X86
+    bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/src/levels.h b/src/levels.h
new file mode 100644
index 0000000..571c580
--- /dev/null
+++ b/src/levels.h
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LEVELS_H
+#define DAV1D_SRC_LEVELS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+enum ObuMetaType {
+    OBU_META_HDR_CLL     = 1,
+    OBU_META_HDR_MDCV    = 2,
+    OBU_META_SCALABILITY = 3,
+    OBU_META_ITUT_T35    = 4,
+    OBU_META_TIMECODE    = 5,
+};
+
+enum TxfmSize {
+    TX_4X4,
+    TX_8X8,
+    TX_16X16,
+    TX_32X32,
+    TX_64X64,
+    N_TX_SIZES,
+};
+
+enum BlockLevel {
+    BL_128X128,
+    BL_64X64,
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,
+    N_BL_LEVELS,
+};
+
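+/* Rectangular sizes extend the square TxfmSize numbering (RTX_4X8 ==
+ * N_TX_SIZES), so both enums can index the same lookup tables. */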
+enum RectTxfmSize {
+    RTX_4X8 = N_TX_SIZES,
+    RTX_8X4,
+    RTX_8X16,
+    RTX_16X8,
+    RTX_16X32,
+    RTX_32X16,
+    RTX_32X64,
+    RTX_64X32,
+    RTX_4X16,
+    RTX_16X4,
+    RTX_8X32,
+    RTX_32X8,
+    RTX_16X64,
+    RTX_64X16,
+    N_RECT_TX_SIZES,
+};
+
+enum TxfmType {
+    DCT_DCT,    // DCT  in both horizontal and vertical
+    ADST_DCT,   // ADST in vertical, DCT in horizontal
+    DCT_ADST,   // DCT  in vertical, ADST in horizontal
+    ADST_ADST,  // ADST in both directions
+    FLIPADST_DCT,
+    DCT_FLIPADST,
+    FLIPADST_FLIPADST,
+    ADST_FLIPADST,
+    FLIPADST_ADST,
+    IDTX,
+    V_DCT,
+    H_DCT,
+    V_ADST,
+    H_ADST,
+    V_FLIPADST,
+    H_FLIPADST,
+    N_TX_TYPES,
+    WHT_WHT = N_TX_TYPES,
+    N_TX_TYPES_PLUS_LL,
+};
+
+enum TxClass {
+    TX_CLASS_2D,
+    TX_CLASS_H,
+    TX_CLASS_V,
+};
+
+enum IntraPredMode {
+    DC_PRED,
+    VERT_PRED,
+    HOR_PRED,
+    DIAG_DOWN_LEFT_PRED,
+    DIAG_DOWN_RIGHT_PRED,
+    VERT_RIGHT_PRED,
+    HOR_DOWN_PRED,
+    HOR_UP_PRED,
+    VERT_LEFT_PRED,
+    SMOOTH_PRED,
+    SMOOTH_V_PRED,
+    SMOOTH_H_PRED,
+    PAETH_PRED,
+    N_INTRA_PRED_MODES,
+    CFL_PRED = N_INTRA_PRED_MODES,
+    N_UV_INTRA_PRED_MODES,
+    N_IMPL_INTRA_PRED_MODES = N_UV_INTRA_PRED_MODES,
+    LEFT_DC_PRED = DIAG_DOWN_LEFT_PRED,
+    TOP_DC_PRED,
+    DC_128_PRED,
+    Z1_PRED,
+    Z2_PRED,
+    Z3_PRED,
+    FILTER_PRED = N_INTRA_PRED_MODES,
+};
+
+enum InterIntraPredMode {
+    II_DC_PRED,
+    II_VERT_PRED,
+    II_HOR_PRED,
+    II_SMOOTH_PRED,
+    N_INTER_INTRA_PRED_MODES,
+};
+
+enum BlockPartition {
+    PARTITION_NONE,     // [ ] <-.
+    PARTITION_H,        // [-]   |
+    PARTITION_V,        // [|]   |
+    PARTITION_SPLIT,    // [+] --'
+    PARTITION_T_TOP_SPLIT,    // [⊥] i.e. split top, H bottom
+    PARTITION_T_BOTTOM_SPLIT, // [т] i.e. H top, split bottom
+    PARTITION_T_LEFT_SPLIT,   // [-|] i.e. split left, V right
+    PARTITION_T_RIGHT_SPLIT,  // [|-] i.e. V left, split right
+    PARTITION_H4,       // [Ⲷ]
+    PARTITION_V4,       // [Ⲽ]
+    N_PARTITIONS,
+    N_SUB8X8_PARTITIONS = PARTITION_T_TOP_SPLIT,
+};
+
+enum BlockSize {
+    BS_128x128,
+    BS_128x64,
+    BS_64x128,
+    BS_64x64,
+    BS_64x32,
+    BS_64x16,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_32x8,
+    BS_16x64,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_16x4,
+    BS_8x32,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x16,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES,
+};
+
+enum Filter2d { // order is horizontal, vertical
+    FILTER_2D_8TAP_REGULAR,
+    FILTER_2D_8TAP_REGULAR_SMOOTH,
+    FILTER_2D_8TAP_REGULAR_SHARP,
+    FILTER_2D_8TAP_SHARP_REGULAR,
+    FILTER_2D_8TAP_SHARP_SMOOTH,
+    FILTER_2D_8TAP_SHARP,
+    FILTER_2D_8TAP_SMOOTH_REGULAR,
+    FILTER_2D_8TAP_SMOOTH,
+    FILTER_2D_8TAP_SMOOTH_SHARP,
+    FILTER_2D_BILINEAR,
+    N_2D_FILTERS,
+};
+
+enum MVJoint {
+    MV_JOINT_ZERO,
+    MV_JOINT_H,
+    MV_JOINT_V,
+    MV_JOINT_HV,
+    N_MV_JOINTS,
+};
+
+enum InterPredMode {
+    NEARESTMV,
+    NEARMV,
+    GLOBALMV,
+    NEWMV,
+    N_INTER_PRED_MODES,
+};
+
+enum DRL_PROXIMITY {
+    NEAREST_DRL,
+    NEARER_DRL,
+    NEAR_DRL,
+    NEARISH_DRL,
+};
+
+enum CompInterPredMode {
+    NEARESTMV_NEARESTMV,
+    NEARMV_NEARMV,
+    NEARESTMV_NEWMV,
+    NEWMV_NEARESTMV,
+    NEARMV_NEWMV,
+    NEWMV_NEARMV,
+    GLOBALMV_GLOBALMV,
+    NEWMV_NEWMV,
+    N_COMP_INTER_PRED_MODES,
+};
+
+enum CompInterType {
+    COMP_INTER_NONE,
+    COMP_INTER_WEIGHTED_AVG,
+    COMP_INTER_AVG,
+    COMP_INTER_SEG,
+    COMP_INTER_WEDGE,
+};
+
+enum InterIntraType {
+    INTER_INTRA_NONE,
+    INTER_INTRA_BLEND,
+    INTER_INTRA_WEDGE,
+};
+
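+/* Motion vector in 1/8-pel units; .n aliases both components so whole
+ * MVs can be compared or copied as a single 32-bit value. */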
+typedef union mv {
+    struct {
+        int16_t y, x;
+    };
+    uint32_t n;
+} mv;
+
+enum MotionMode {
+    MM_TRANSLATION,
+    MM_OBMC,
+    MM_WARP,
+};
+
+#define QINDEX_RANGE 256
+
+typedef struct Av1Block {
+    uint8_t bl, bs, bp;
+    uint8_t intra, seg_id, skip_mode, skip, uvtx;
+    union {
+        struct {
+            uint8_t y_mode, uv_mode, tx, pal_sz[2];
+            int8_t y_angle, uv_angle, cfl_alpha[2];
+        }; // intra
+        struct {
+            union {
+                struct {
+                    union mv mv[2];
+                    uint8_t wedge_idx, mask_sign, interintra_mode;
+                };
+                struct {
+                    union mv mv2d;
+                    int16_t matrix[4];
+                };
+            };
+            uint8_t comp_type, inter_mode, motion_mode, drl_idx;
+            int8_t ref[2];
+            uint8_t max_ytx, filter2d, interintra_type, tx_split0;
+            uint16_t tx_split1;
+        }; // inter
+    };
+} Av1Block;
+
+#endif /* DAV1D_SRC_LEVELS_H */
diff --git a/src/lf_apply.h b/src/lf_apply.h
new file mode 100644
index 0000000..6b63b62
--- /dev/null
+++ b/src/lf_apply.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_APPLY_H
+#define DAV1D_SRC_LF_APPLY_H
+
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+#include "src/levels.h"
+
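+/* bytefn() expands to a per-bitdepth symbol (dav1d_loopfilter_sbrow_8bpc
+ * or _16bpc); this header is included from bitdepth-templated sources. */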
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f,
+                                    pixel *const p[3], Av1Filter *lflvl,
+                                    int sby, int start_of_tile_row);
+
+#endif /* DAV1D_SRC_LF_APPLY_H */
diff --git a/src/lf_apply_tmpl.c b/src/lf_apply_tmpl.c
new file mode 100644
index 0000000..4e860f4
--- /dev/null
+++ b/src/lf_apply_tmpl.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
+                                       const int have_left,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[4];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) {
+                hmask[0] |= (unsigned) mask[x][0][1] << 16;
+                hmask[1] |= (unsigned) mask[x][1][1] << 16;
+                hmask[2] |= (unsigned) mask[x][2][1] << 16;
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
+                                       const int have_top,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[4] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
+                                        const int have_left,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_ver)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[3];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) {
+                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+    }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
+                                        const int have_top,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_hor)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    ptrdiff_t off_l = 0;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[3] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+    }
+}
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
+                                    pixel *const p[3], Av1Filter *const lflvl,
+                                    int sby, const int start_of_tile_row)
+{
+    int x, have_left;
+    // Don't filter outside the frame
+    const int have_top = sby > 0;
+    const int is_sb64 = !f->seq_hdr->sb128;
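+    // with 64x64 superblocks, two sbrows share one 128x128 Av1Filter
+    // struct, so the odd row starts at mask row 16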
+    const int starty4 = (sby & is_sb64) << 4;
+    const int sbsz = 32 >> is_sb64;
+    const int sbl2 = 5 - is_sb64;
+    const int halign = (f->bh + 31) & ~31;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+    const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
+    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
+    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+    // fix lpf strength at tile col boundaries
+    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+    for (int tile_col = 1;; tile_col++) {
+        x = f->frame_hdr->tiling.col_start_sb[tile_col];
+        if ((x << sbl2) >= f->bw) break;
+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
+        x >>= is_sb64;
+
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U;
+            const unsigned smask = mask >> (sidx << 4);
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+                                !!(y_hmask[1][sidx] & smask);
+            y_hmask[2][sidx] &= ~smask;
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
+        }
+
+        if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+                 y++, uv_mask <<= 1)
+            {
+                const int sidx = uv_mask >= vmax;
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+                const int idx = !!(uv_hmask[1][sidx] & smask);
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+            }
+        }
+        lpf_y  += halign;
+        lpf_uv += halign >> ss_ver;
+    }
+
+    // fix lpf strength at tile row boundaries
+    if (start_of_tile_row) {
+        const BlockContext *a;
+        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
+             x < f->sb128w; x++, a++)
+        {
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+            const unsigned w = imin(32, f->w4 - (x << 5));
+            for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
+                const int sidx = mask >= 0x10000U;
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
+            }
+
+            if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                const unsigned cw = (w + ss_hor) >> ss_hor;
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
+                    const int sidx = uv_mask >= hmax;
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+                }
+            }
+        }
+    }
+
+    pixel *ptr;
+    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
+         x++, have_left = 1, ptr += 128, level_ptr += 32)
+    {
+        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[0], ptr, f->cur.stride[0],
+                            imin(32, f->w4 - x * 32), starty4, endy4);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
+        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[1], ptr, f->cur.stride[0],
+                            imin(32, f->w4 - x * 32), starty4, endy4);
+    }
+
+    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+        return;
+
+    ptrdiff_t uv_off;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[0],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, x = 0; x < f->sb128w;
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[1],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
+    }
+}
diff --git a/src/lf_mask.c b/src/lf_mask.c
new file mode 100644
index 0000000..4c99864
--- /dev/null
+++ b/src/lf_mask.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ctx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/tables.h"
+
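+/* Recursively descends the tx split tree encoded in tx_masks and records,
+ * for each 4x4 unit, the transform size (txa[..][0]) and the step to the
+ * next transform edge (txa[..][1]), separately per filter direction. */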
+static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* x */],
+                      const enum RectTxfmSize from,
+                      const int depth,
+                      const int y_off, const int x_off,
+                      const uint16_t *const tx_masks)
+{
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+    const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
+        (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
+
+    if (is_split) {
+        const enum RectTxfmSize sub = t_dim->sub;
+        const int htw4 = t_dim->w >> 1, hth4 = t_dim->h >> 1;
+
+        decomp_tx(txa, sub, depth + 1, y_off * 2 + 0, x_off * 2 + 0, tx_masks);
+        if (t_dim->w >= t_dim->h)
+            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][0][htw4],
+                      sub, depth + 1, y_off * 2 + 0, x_off * 2 + 1, tx_masks);
+        if (t_dim->h >= t_dim->w) {
+            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][0],
+                      sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks);
+            if (t_dim->w >= t_dim->h)
+                decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4],
+                          sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks);
+        }
+    } else {
+        const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < t_dim->h; y++) { \
+            rep_macro(type, txa[0][0][y], off, mul * lw); \
+            rep_macro(type, txa[1][0][y], off, mul * lh); \
+            txa[0][1][y][0] = t_dim->w; \
+        }
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+    }
+}
+
+static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
+                                    const int by4, const int bx4,
+                                    const int w4, const int h4, const int skip,
+                                    const enum RectTxfmSize max_tx,
+                                    const uint16_t *const tx_masks,
+                                    uint8_t *const a, uint8_t *const l)
+{
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
+    int y, x;
+
+    uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */];
+    for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
+        for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
+            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],
+                      max_tx, 0, y_off, x_off, tx_masks);
+
+    // left block edge
+    unsigned mask = 1U << by4;
+    for (y = 0; y < h4; y++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask;
+    }
+
+    // top block edge
+    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask;
+    }
+
+    if (!skip) {
+        // inner (tx) left|right edges
+        for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U;
+            const unsigned smask = mask >> (sidx << 4);
+            int ltx = txa[0][0][y][0];
+            int step = txa[0][1][y][0];
+            for (x = step; x < w4; x += step) {
+                const int rtx = txa[0][0][y][x];
+                masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask;
+                ltx = rtx;
+                step = txa[0][1][y][x];
+            }
+        }
+
+        //            top
+        // inner (tx) --- edges
+        //           bottom
+        for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U;
+            const unsigned smask = mask >> (sidx << 4);
+            int ttx = txa[1][0][0][x];
+            int step = txa[1][1][0][x];
+            for (y = step; y < h4; y += step) {
+                const int btx = txa[1][0][y][x];
+                masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask;
+                ttx = btx;
+                step = txa[1][1][y][x];
+            }
+        }
+    }
+
+    for (y = 0; y < h4; y++)
+        l[y] = txa[0][0][y][w4 - 1];
+    memcpy(a, txa[1][0][h4 - 1], w4);
+}
+
+static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
+                                    const int by4, const int bx4,
+                                    const int w4, const int h4,
+                                    const enum RectTxfmSize tx,
+                                    uint8_t *const a, uint8_t *const l)
+{
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+    const int twl4c = imin(2, twl4), thl4c = imin(2, thl4);
+    int y, x;
+
+    // left block edge
+    unsigned mask = 1U << by4;
+    for (y = 0; y < h4; y++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask;
+    }
+
+    // top block edge
+    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[1][by4][imin(thl4c, a[x])][sidx] |= smask;
+    }
+
+    // inner (tx) left|right edges
+    const int hstep = t_dim->w;
+    unsigned t = 1U << by4;
+    unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t);
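+    // "inner" has h4 consecutive bits set starting at bit by4; split it
+    // into the two 16-bit mask words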
+    unsigned inner1 = inner & 0xffff, inner2 = inner >> 16;
+    for (x = hstep; x < w4; x += hstep) {
+        if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
+        if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
+    }
+
+    //            top
+    // inner (tx) --- edges
+    //           bottom
+    const int vstep = t_dim->h;
+    t = 1U << bx4;
+    inner = (unsigned) ((((uint64_t) t) << w4) - t);
+    inner1 = inner & 0xffff;
+    inner2 = inner >> 16;
+    for (y = vstep; y < h4; y += vstep) {
+        if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
+        if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
+    }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
+                                     const int cby4, const int cbx4,
+                                     const int cw4, const int ch4,
+                                     const int skip_inter,
+                                     const enum RectTxfmSize tx,
+                                     uint8_t *const a, uint8_t *const l,
+                                     const int ss_hor, const int ss_ver)
+{
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+    const int twl4c = !!twl4, thl4c = !!thl4;
+    int y, x;
+    const int vbits = 4 - ss_ver, hbits = 4 - ss_hor;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
+
+    // left block edge
+    unsigned mask = 1U << cby4;
+    for (y = 0; y < ch4; y++, mask <<= 1) {
+        const int sidx = mask >= vmax;
+        const unsigned smask = mask >> (sidx << vbits);
+        masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask;
+    }
+
+    // top block edge
+    for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) {
+        const int sidx = mask >= hmax;
+        const unsigned smask = mask >> (sidx << hbits);
+        masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask;
+    }
+
+    if (!skip_inter) {
+        // inner (tx) left|right edges
+        const int hstep = t_dim->w;
+        unsigned t = 1U << cby4;
+        unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t);
+        unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask;
+        for (x = hstep; x < cw4; x += hstep) {
+            if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1;
+            if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2;
+        }
+
+        //            top
+        // inner (tx) --- edges
+        //           bottom
+        const int vstep = t_dim->h;
+        t = 1U << cbx4;
+        inner = (unsigned) ((((uint64_t) t) << cw4) - t);
+        inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
+        for (y = vstep; y < ch4; y += vstep) {
+            if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
+            if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
+        }
+    }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
+                                uint8_t (*const level_cache)[4],
+                                const ptrdiff_t b4_stride,
+                                const uint8_t (*filter_level)[8][2],
+                                const int bx, const int by,
+                                const int iw, const int ih,
+                                const enum BlockSize bs,
+                                const enum RectTxfmSize ytx,
+                                const enum RectTxfmSize uvtx,
+                                const enum Dav1dPixelLayout layout,
+                                uint8_t *const ay, uint8_t *const ly,
+                                uint8_t *const auv, uint8_t *const luv)
+{
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = imin(iw - bx, b_dim[0]);
+    const int bh4 = imin(ih - by, b_dim[1]);
+    const int bx4 = bx & 31;
+    const int by4 = by & 31;
+
+    if (bw4 && bh4) {
+        uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+        for (int y = 0; y < bh4; y++) {
+            for (int x = 0; x < bw4; x++) {
+                level_cache_ptr[x][0] = filter_level[0][0][0];
+                level_cache_ptr[x][1] = filter_level[1][0][0];
+            }
+            level_cache_ptr += b4_stride;
+        }
+
+        mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);
+    }
+
+    if (!auv) return;
+
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+                          (b_dim[0] + ss_hor) >> ss_hor);
+    const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+                          (b_dim[1] + ss_ver) >> ss_ver);
+
+    if (!cbw4 || !cbh4) return;
+
+    const int cbx4 = bx4 >> ss_hor;
+    const int cby4 = by4 >> ss_ver;
+
+    uint8_t (*level_cache_ptr)[4] =
+        level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+    for (int y = 0; y < cbh4; y++) {
+        for (int x = 0; x < cbw4; x++) {
+            level_cache_ptr[x][2] = filter_level[2][0][0];
+            level_cache_ptr[x][3] = filter_level[3][0][0];
+        }
+        level_cache_ptr += b4_stride;
+    }
+
+    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx,
+                      auv, luv, ss_hor, ss_ver);
+}
+
+void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
+                                uint8_t (*const level_cache)[4],
+                                const ptrdiff_t b4_stride,
+                                const uint8_t (*filter_level)[8][2],
+                                const int bx, const int by,
+                                const int iw, const int ih,
+                                const int skip, const enum BlockSize bs,
+                                const enum RectTxfmSize max_ytx,
+                                const uint16_t *const tx_masks,
+                                const enum RectTxfmSize uvtx,
+                                const enum Dav1dPixelLayout layout,
+                                uint8_t *const ay, uint8_t *const ly,
+                                uint8_t *const auv, uint8_t *const luv)
+{
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = imin(iw - bx, b_dim[0]);
+    const int bh4 = imin(ih - by, b_dim[1]);
+    const int bx4 = bx & 31;
+    const int by4 = by & 31;
+
+    if (bw4 && bh4) {
+        uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+        for (int y = 0; y < bh4; y++) {
+            for (int x = 0; x < bw4; x++) {
+                level_cache_ptr[x][0] = filter_level[0][0][0];
+                level_cache_ptr[x][1] = filter_level[1][0][0];
+            }
+            level_cache_ptr += b4_stride;
+        }
+
+        mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
+                         max_ytx, tx_masks, ay, ly);
+    }
+
+    if (!auv) return;
+
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+                          (b_dim[0] + ss_hor) >> ss_hor);
+    const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+                          (b_dim[1] + ss_ver) >> ss_ver);
+
+    if (!cbw4 || !cbh4) return;
+
+    const int cbx4 = bx4 >> ss_hor;
+    const int cby4 = by4 >> ss_ver;
+
+    uint8_t (*level_cache_ptr)[4] =
+        level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+    for (int y = 0; y < cbh4; y++) {
+        for (int x = 0; x < cbw4; x++) {
+            level_cache_ptr[x][2] = filter_level[2][0][0];
+            level_cache_ptr[x][3] = filter_level[3][0][0];
+        }
+        level_cache_ptr += b4_stride;
+    }
+
+    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx,
+                      auv, luv, ss_hor, ss_ver);
+}
+
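+/* E is the edge ("blimit") and I the interior ("limit") difference
+ * threshold of the AV1 deblocking filter; sharp[] caches the two
+ * sharpness-derived terms. */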
+void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
+    // set E/I/H values from loopfilter level
+    const int sharp = filter_sharpness;
+    for (int level = 0; level < 64; level++) {
+        int limit = level;
+
+        if (sharp > 0) {
+            limit >>= (sharp + 3) >> 2;
+            limit = imin(limit, 9 - sharp);
+        }
+        limit = imax(limit, 1);
+
+        lim_lut->i[level] = limit;
+        lim_lut->e[level] = 2 * (level + 2) + limit;
+    }
+    lim_lut->sharp[0] = (sharp + 3) >> 2;
+    lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
+}
+
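+/* Combines the base filter level with the delta-LF and segment deltas,
+ * then applies the per-ref/mode deltas (scaled by 2 once the base level
+ * reaches 32), clipping every step to the valid 0..63 range. */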
+static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
+                                 const int is_chroma, const int base_lvl,
+                                 const int lf_delta, const int seg_delta,
+                                 const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+    const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
+
+    if (!base_lvl && is_chroma) {
+        memset(lflvl_values, 0, 8 * 2);
+    } else if (!mr_delta) {
+        memset(lflvl_values, base, 8 * 2);
+    } else {
+        const int sh = base >= 32;
+        lflvl_values[0][0] = lflvl_values[0][1] =
+            iclip(base + (mr_delta->ref_delta[0] * (1 << sh)), 0, 63);
+        for (int r = 1; r < 8; r++) {
+            for (int m = 0; m < 2; m++) {
+                const int delta =
+                    mr_delta->mode_delta[m] + mr_delta->ref_delta[r];
+                lflvl_values[r][m] = iclip(base + (delta * (1 << sh)), 0, 63);
+            }
+        }
+    }
+}
+
+void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
+                          const Dav1dFrameHeader *const hdr,
+                          const int8_t lf_delta[4])
+{
+    const int n_seg = hdr->segmentation.enabled ? 8 : 1;
+
+    if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
+        memset(lflvl_values, 0, 8 * 4 * 2 * n_seg);
+        return;
+    }
+
+    const Dav1dLoopfilterModeRefDeltas *const mr_deltas =
+        hdr->loopfilter.mode_ref_delta_enabled ?
+        &hdr->loopfilter.mode_ref_deltas : NULL;
+    for (int s = 0; s < n_seg; s++) {
+        const Dav1dSegmentationData *const segd =
+            hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
+
+        calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0],
+                      lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
+        calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1],
+                      lf_delta[hdr->delta.lf.multi ? 1 : 0],
+                      segd ? segd->delta_lf_y_h : 0, mr_deltas);
+        calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u,
+                      lf_delta[hdr->delta.lf.multi ? 2 : 0],
+                      segd ? segd->delta_lf_u : 0, mr_deltas);
+        calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v,
+                      lf_delta[hdr->delta.lf.multi ? 3 : 0],
+                      segd ? segd->delta_lf_v : 0, mr_deltas);
+    }
+}
diff --git a/src/lf_mask.h b/src/lf_mask.h
new file mode 100644
index 0000000..0c9caa6
--- /dev/null
+++ b/src/lf_mask.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_MASK_H
+#define DAV1D_SRC_LF_MASK_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "src/levels.h"
+
+typedef struct Av1FilterLUT {
+    uint8_t e[64];
+    uint8_t i[64];
+    uint64_t sharp[2];
+} Av1FilterLUT;
+
+typedef struct Av1RestorationUnit {
+    enum Dav1dRestorationType type;
+    int16_t filter_h[3];
+    int16_t filter_v[3];
+    uint8_t sgr_idx;
+    int16_t sgr_weights[2];
+} Av1RestorationUnit;
+
+// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
+typedef struct Av1Filter {
+    // each bit is 1 col
+    uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
+    uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
+    int8_t cdef_idx[4]; // -1 means "unset"
+    uint16_t noskip_mask[32][2];
+} Av1Filter;
+
+// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
+typedef struct Av1Restoration {
+    Av1RestorationUnit lr[3][4];
+} Av1Restoration;
+
+void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+                                const ptrdiff_t b4_stride,
+                                const uint8_t (*level)[8][2], int bx, int by,
+                                int iw, int ih, enum BlockSize bs,
+                                enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
+                                enum Dav1dPixelLayout layout, uint8_t *ay,
+                                uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+                                const ptrdiff_t b4_stride,
+                                const uint8_t (*level)[8][2], int bx, int by,
+                                int iw, int ih, int skip_inter,
+                                enum BlockSize bs, enum RectTxfmSize max_ytx,
+                                const uint16_t *tx_mask, enum RectTxfmSize uvtx,
+                                enum Dav1dPixelLayout layout, uint8_t *ay,
+                                uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
+void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr,
+                          const int8_t lf_delta[4]);
+
+#endif /* DAV1D_SRC_LF_MASK_H */
diff --git a/src/lib.c b/src/lib.c
new file mode 100644
index 0000000..82af64a
--- /dev/null
+++ b/src/lib.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "vcs_version.h"
+
+#include <errno.h>
+#include <string.h>
+
+#if defined(__linux__) && defined(HAVE_DLSYM)
+#include <dlfcn.h>
+#endif
+
+#include "dav1d/dav1d.h"
+#include "dav1d/data.h"
+
+#include "common/mem.h"
+#include "common/validate.h"
+
+#include "src/cpu.h"
+#include "src/fg_apply.h"
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/qm.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+#include "src/wedge.h"
+
+static COLD void init_internal(void) {
+    dav1d_init_cpu();
+    dav1d_init_interintra_masks();
+    dav1d_init_qm_tables();
+    dav1d_init_thread();
+    dav1d_init_wedge_masks();
+}
+
+COLD const char *dav1d_version(void) {
+    return DAV1D_VERSION;
+}
+
+COLD void dav1d_default_settings(Dav1dSettings *const s) {
+    s->n_frame_threads = 1;
+    s->n_tile_threads = 1;
+    s->apply_grain = 1;
+    s->allocator.cookie = NULL;
+    s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
+    s->allocator.release_picture_callback = dav1d_default_picture_release;
+    s->logger.cookie = NULL;
+    s->logger.callback = dav1d_log_default_callback;
+    s->operating_point = 0;
+    s->all_layers = 1; // just until the tests are adjusted
+    s->frame_size_limit = 0;
+}
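+
+/* Minimal calling sketch (illustrative only, not part of the library;
+ * the thread counts are arbitrary):
+ *
+ *     Dav1dSettings s;
+ *     dav1d_default_settings(&s);
+ *     s.n_frame_threads = 2;
+ *     s.n_tile_threads = 2;
+ *     Dav1dContext *c;
+ *     int res = dav1d_open(&c, &s);
+ *     if (res < 0) return res;
+ */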
+
+static void close_internal(Dav1dContext **const c_out, int flush);
+
+NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
+static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
+#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__)
+    /* glibc has an issue where the size of the TLS is subtracted from the stack
+     * size instead of allocated separately. As a result the specified stack
+     * size may be insufficient when used in an application with large amounts
+     * of TLS data. The following is a workaround to compensate for that.
+     * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
+    size_t (*const get_minstack)(const pthread_attr_t*) =
+        dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
+    if (get_minstack)
+        return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
+#endif
+    return 0;
+}
+
+COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+    static pthread_once_t initted = PTHREAD_ONCE_INIT;
+    pthread_once(&initted, init_internal);
+
+    validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s->n_tile_threads >= 1 &&
+                          s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s->n_frame_threads >= 1 &&
+                          s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
+                          DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s->allocator.release_picture_callback != NULL,
+                          DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s->operating_point >= 0 &&
+                          s->operating_point <= 31, DAV1D_ERR(EINVAL));
+
+    pthread_attr_t thread_attr;
+    if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+    size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);
+
+    pthread_attr_setstacksize(&thread_attr, stack_size);
+
+    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
+    if (!c) goto error;
+    memset(c, 0, sizeof(*c));
+
+    c->allocator = s->allocator;
+    c->logger = s->logger;
+    c->apply_grain = s->apply_grain;
+    c->operating_point = s->operating_point;
+    c->all_layers = s->all_layers;
+    c->frame_size_limit = s->frame_size_limit;
+
+    /* On 32-bit systems extremely large frame sizes can cause overflows in
+     * dav1d_decode_frame() malloc size calculations. Prevent that from occurring
+     * by enforcing a maximum frame size limit, chosen to roughly correspond to
+     * the largest size possible to decode without exhausting virtual memory. */
+    if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) {
+        c->frame_size_limit = 8192 * 8192;
+        if (s->frame_size_limit)
+            dav1d_log(c, "Frame size limit reduced from %u to %u.\n",
+                      s->frame_size_limit, c->frame_size_limit);
+    }
+
+    c->frame_thread.flush = &c->frame_thread.flush_mem;
+    atomic_init(c->frame_thread.flush, 0);
+    c->n_fc = s->n_frame_threads;
+    c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
+    if (!c->fc) goto error;
+    memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
+    if (c->n_fc > 1) {
+        c->frame_thread.out_delayed =
+            calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
+        if (!c->frame_thread.out_delayed) goto error;
+    }
+    for (int n = 0; n < s->n_frame_threads; n++) {
+        Dav1dFrameContext *const f = &c->fc[n];
+        f->c = c;
+        f->lf.last_sharpness = -1;
+        f->n_tc = s->n_tile_threads;
+        f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
+        if (!f->tc) goto error;
+        memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
+        if (f->n_tc > 1) {
+            if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error;
+            if (pthread_cond_init(&f->tile_thread.cond, NULL)) {
+                pthread_mutex_destroy(&f->tile_thread.lock);
+                goto error;
+            }
+            if (pthread_cond_init(&f->tile_thread.icond, NULL)) {
+                pthread_mutex_destroy(&f->tile_thread.lock);
+                pthread_cond_destroy(&f->tile_thread.cond);
+                goto error;
+            }
+            f->tile_thread.inited = 1;
+        }
+        for (int m = 0; m < s->n_tile_threads; m++) {
+            Dav1dTileContext *const t = &f->tc[m];
+            t->f = f;
+            memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
+            if (f->n_tc > 1) {
+                if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error;
+                if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) {
+                    pthread_mutex_destroy(&t->tile_thread.td.lock);
+                    goto error;
+                }
+                t->tile_thread.fttd = &f->tile_thread;
+                if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) {
+                    pthread_cond_destroy(&t->tile_thread.td.cond);
+                    pthread_mutex_destroy(&t->tile_thread.td.lock);
+                    goto error;
+                }
+                t->tile_thread.td.inited = 1;
+            }
+        }
+        dav1d_refmvs_init(&f->rf);
+        if (c->n_fc > 1) {
+            if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error;
+            if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) {
+                pthread_mutex_destroy(&f->frame_thread.td.lock);
+                goto error;
+            }
+            if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) {
+                pthread_cond_destroy(&f->frame_thread.td.cond);
+                pthread_mutex_destroy(&f->frame_thread.td.lock);
+                goto error;
+            }
+            f->frame_thread.td.inited = 1;
+        }
+    }
+
+    // intra edge tree
+    c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;
+    dav1d_init_mode_tree(c->intra_edge.root[BL_128X128], c->intra_edge.tip_sb128, 1);
+    c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node;
+    dav1d_init_mode_tree(c->intra_edge.root[BL_64X64], c->intra_edge.tip_sb64, 0);
+
+    pthread_attr_destroy(&thread_attr);
+
+    return 0;
+
+error:
+    if (c) close_internal(c_out, 0);
+    pthread_attr_destroy(&thread_attr);
+    return DAV1D_ERR(ENOMEM);
+}
+
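+/* No-op free callback: dav1d_parse_sequence_header() wraps caller-owned
+ * memory with it, so there is nothing to release when the last reference
+ * is dropped. */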
+static void dummy_free(const uint8_t *const data, void *const user_data) {
+    assert(data && !user_data);
+}
+
+int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
+                                const uint8_t *const ptr, const size_t sz)
+{
+    Dav1dData buf = { 0 };
+    int res;
+
+    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+    Dav1dSettings s;
+    dav1d_default_settings(&s);
+    s.logger.callback = NULL;
+
+    Dav1dContext *c;
+    res = dav1d_open(&c, &s);
+    if (res < 0) return res;
+
+    if (ptr) {
+        res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL);
+        if (res < 0) goto error;
+    }
+
+    while (buf.sz > 0) {
+        res = dav1d_parse_obus(c, &buf, 1);
+        if (res < 0) goto error;
+
+        assert((size_t)res <= buf.sz);
+        buf.sz -= res;
+        buf.data += res;
+    }
+
+    if (!c->seq_hdr) {
+        res = DAV1D_ERR(EINVAL);
+        goto error;
+    }
+
+    memcpy(out, c->seq_hdr, sizeof(*out));
+
+    res = 0;
+error:
+    dav1d_data_unref_internal(&buf);
+    dav1d_close(&c);
+
+    return res;
+}
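+
+/* Illustrative usage sketch (not part of the library): probing a raw OBU
+ * buffer for its sequence header before creating a full decoder:
+ *
+ *     Dav1dSequenceHeader seq;
+ *     if (!dav1d_parse_sequence_header(&seq, data, size))
+ *         printf("%dx%d, profile %d\n", seq.max_width, seq.max_height,
+ *                seq.profile);
+ */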
+
+static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
+                        Dav1dPicture *const in)
+{
+    const Dav1dFilmGrainData *fgdata = &in->frame_hdr->film_grain.data;
+    int has_grain = fgdata->num_y_points || fgdata->num_uv_points[0] ||
+                    fgdata->num_uv_points[1];
+
+    // If there is nothing to be done, skip the allocation/copy
+    if (!c->apply_grain || !has_grain) {
+        dav1d_picture_move_ref(out, in);
+        return 0;
+    }
+
+    // Apply film grain to a new copy of the image to avoid corrupting refs
+    int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
+    if (res < 0) {
+        dav1d_picture_unref_internal(in);
+        dav1d_picture_unref_internal(out);
+        return res;
+    }
+
+    switch (out->p.bpc) {
+#if CONFIG_8BPC
+    case 8:
+        dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
+        break;
+#endif
+#if CONFIG_16BPC
+    case 10:
+    case 12:
+        dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
+        break;
+#endif
+    default:
+        assert(0);
+    }
+
+    dav1d_picture_unref_internal(in);
+    return 0;
+}
+
+static int output_picture_ready(Dav1dContext *const c) {
+    if (!c->out.data[0]) return 0;
+
+    // skip lower spatial layers
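+    // (operating_point_idc bits 8..11 flag the spatial layers present, so
+    // the position of the highest set bit is the top spatial layer, which
+    // is the only one output when all_layers is disabled)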
+    if (c->operating_point_idc && !c->all_layers) {
+        const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
+        if (max_spatial_id > c->out.frame_hdr->spatial_id) {
+            dav1d_picture_unref_internal(&c->out);
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
+    unsigned drain_count = 0;
+    do {
+        const unsigned next = c->frame_thread.next;
+        Dav1dFrameContext *const f = &c->fc[next];
+        pthread_mutex_lock(&f->frame_thread.td.lock);
+        while (f->n_tile_data > 0)
+            pthread_cond_wait(&f->frame_thread.td.cond,
+                              &f->frame_thread.td.lock);
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+        Dav1dThreadPicture *const out_delayed =
+            &c->frame_thread.out_delayed[next];
+        if (++c->frame_thread.next == c->n_fc)
+            c->frame_thread.next = 0;
+        if (out_delayed->p.data[0]) {
+            const unsigned progress =
+                atomic_load_explicit(&out_delayed->progress[1],
+                                     memory_order_relaxed);
+            if (out_delayed->visible && progress != FRAME_ERROR)
+                dav1d_picture_ref(&c->out, &out_delayed->p);
+            dav1d_thread_picture_unref(out_delayed);
+            if (output_picture_ready(c))
+                return output_image(c, out, &c->out);
+        }
+    } while (++drain_count < c->n_fc);
+
+    return DAV1D_ERR(EAGAIN);
+}
+
+static int gen_picture(Dav1dContext *const c)
+{
+    int res;
+    Dav1dData *const in = &c->in;
+
+    if (output_picture_ready(c))
+        return 0;
+
+    while (in->sz > 0) {
+        res = dav1d_parse_obus(c, in, 0);
+        if (res < 0) {
+            dav1d_data_unref_internal(in);
+        } else {
+            assert((size_t)res <= in->sz);
+            in->sz -= res;
+            in->data += res;
+            if (!in->sz) dav1d_data_unref_internal(in);
+        }
+        if (output_picture_ready(c))
+            break;
+        if (res < 0)
+            return res;
+    }
+
+    return 0;
+}
+
+int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
+{
+    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(in->data == NULL || in->sz, DAV1D_ERR(EINVAL));
+
+    if (in->data)
+        c->drain = 0;
+    if (c->in.data)
+        return DAV1D_ERR(EAGAIN);
+    dav1d_data_ref(&c->in, in);
+
+    int res = gen_picture(c);
+    if (!res)
+        dav1d_data_unref_internal(in);
+
+    return res;
+}
+
+int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
+{
+    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+    const int drain = c->drain;
+    c->drain = 1;
+
+    int res = gen_picture(c);
+    if (res < 0)
+        return res;
+
+    if (output_picture_ready(c))
+        return output_image(c, out, &c->out);
+
+    if (c->n_fc > 1 && drain)
+        return drain_picture(c, out);
+
+    return DAV1D_ERR(EAGAIN);
+}
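+
+/* Illustrative decode-loop sketch (consume() stands in for caller code):
+ * dav1d_send_data() and dav1d_get_picture() both return DAV1D_ERR(EAGAIN)
+ * to mean "try the other call first":
+ *
+ *     Dav1dPicture pic = { 0 };
+ *     int res = dav1d_send_data(c, &data);
+ *     if (res < 0 && res != DAV1D_ERR(EAGAIN)) goto fail;
+ *     res = dav1d_get_picture(c, &pic);
+ *     if (res == 0) { consume(&pic); dav1d_picture_unref(&pic); }
+ *     else if (res != DAV1D_ERR(EAGAIN)) goto fail;
+ */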
+
+void dav1d_flush(Dav1dContext *const c) {
+    dav1d_data_unref_internal(&c->in);
+    c->drain = 0;
+
+    for (int i = 0; i < 8; i++) {
+        if (c->refs[i].p.p.data[0])
+            dav1d_thread_picture_unref(&c->refs[i].p);
+        dav1d_ref_dec(&c->refs[i].segmap);
+        dav1d_ref_dec(&c->refs[i].refmvs);
+        dav1d_cdf_thread_unref(&c->cdf[i]);
+    }
+    c->frame_hdr = NULL;
+    c->seq_hdr = NULL;
+    dav1d_ref_dec(&c->seq_hdr_ref);
+
+    c->mastering_display = NULL;
+    c->content_light = NULL;
+    c->itut_t35 = NULL;
+    dav1d_ref_dec(&c->mastering_display_ref);
+    dav1d_ref_dec(&c->content_light_ref);
+    dav1d_ref_dec(&c->itut_t35_ref);
+
+    if (c->n_fc == 1) return;
+
+    // mark each currently-running frame as flushing, so that the worker
+    // threads bail out as soon as they check this flag
+    atomic_store(c->frame_thread.flush, 1);
+    for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
+        if (next == c->n_fc) next = 0;
+        Dav1dFrameContext *const f = &c->fc[next];
+        pthread_mutex_lock(&f->frame_thread.td.lock);
+        if (f->n_tile_data > 0) {
+            while (f->n_tile_data > 0)
+                pthread_cond_wait(&f->frame_thread.td.cond,
+                                  &f->frame_thread.td.lock);
+            assert(!f->cur.data[0]);
+        }
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+        Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
+        if (out_delayed->p.data[0])
+            dav1d_thread_picture_unref(out_delayed);
+    }
+    atomic_store(c->frame_thread.flush, 0);
+
+    c->frame_thread.next = 0;
+}
+
+COLD void dav1d_close(Dav1dContext **const c_out) {
+    validate_input(c_out != NULL);
+    close_internal(c_out, 1);
+}
+
+static COLD void close_internal(Dav1dContext **const c_out, int flush) {
+    Dav1dContext *const c = *c_out;
+    if (!c) return;
+
+    if (flush) dav1d_flush(c);
+
+    for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
+        Dav1dFrameContext *const f = &c->fc[n];
+
+        // clean-up threading stuff
+        if (c->n_fc > 1 && f->frame_thread.td.inited) {
+            pthread_mutex_lock(&f->frame_thread.td.lock);
+            f->frame_thread.die = 1;
+            pthread_cond_signal(&f->frame_thread.td.cond);
+            pthread_mutex_unlock(&f->frame_thread.td.lock);
+            pthread_join(f->frame_thread.td.thread, NULL);
+            freep(&f->frame_thread.b);
+            dav1d_freep_aligned(&f->frame_thread.pal_idx);
+            dav1d_freep_aligned(&f->frame_thread.cf);
+            freep(&f->frame_thread.tile_start_off);
+            dav1d_freep_aligned(&f->frame_thread.pal);
+            freep(&f->frame_thread.cbi);
+            pthread_mutex_destroy(&f->frame_thread.td.lock);
+            pthread_cond_destroy(&f->frame_thread.td.cond);
+        }
+        if (f->n_tc > 1 && f->tc && f->tile_thread.inited) {
+            pthread_mutex_lock(&f->tile_thread.lock);
+            for (int m = 0; m < f->n_tc; m++) {
+                Dav1dTileContext *const t = &f->tc[m];
+                t->tile_thread.die = 1;
+                // mark not created tile threads as available
+                if (!t->tile_thread.td.inited)
+                    f->tile_thread.available |= 1ULL<<m;
+            }
+            pthread_cond_broadcast(&f->tile_thread.cond);
+            while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc))
+                pthread_cond_wait(&f->tile_thread.icond,
+                                  &f->tile_thread.lock);
+            pthread_mutex_unlock(&f->tile_thread.lock);
+            for (int m = 0; m < f->n_tc; m++) {
+                Dav1dTileContext *const t = &f->tc[m];
+                if (f->n_tc > 1 && t->tile_thread.td.inited) {
+                    pthread_join(t->tile_thread.td.thread, NULL);
+                    pthread_mutex_destroy(&t->tile_thread.td.lock);
+                    pthread_cond_destroy(&t->tile_thread.td.cond);
+                }
+            }
+            pthread_mutex_destroy(&f->tile_thread.lock);
+            pthread_cond_destroy(&f->tile_thread.cond);
+            pthread_cond_destroy(&f->tile_thread.icond);
+            freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
+        }
+        for (int m = 0; f->ts && m < f->n_ts; m++) {
+            Dav1dTileState *const ts = &f->ts[m];
+            pthread_cond_destroy(&ts->tile_thread.cond);
+            pthread_mutex_destroy(&ts->tile_thread.lock);
+        }
+        dav1d_free_aligned(f->ts);
+        dav1d_free_aligned(f->tc);
+        dav1d_free_aligned(f->ipred_edge[0]);
+        free(f->a);
+        free(f->tile);
+        free(f->lf.mask);
+        free(f->lf.lr_mask);
+        free(f->lf.level);
+        free(f->lf.tx_lpf_right_edge[0]);
+        dav1d_refmvs_clear(&f->rf);
+        dav1d_free_aligned(f->lf.cdef_line_buf);
+        dav1d_free_aligned(f->lf.lr_lpf_line[0]);
+    }
+    dav1d_free_aligned(c->fc);
+    dav1d_data_unref_internal(&c->in);
+    if (c->n_fc > 1 && c->frame_thread.out_delayed) {
+        for (unsigned n = 0; n < c->n_fc; n++)
+            if (c->frame_thread.out_delayed[n].p.data[0])
+                dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
+        free(c->frame_thread.out_delayed);
+    }
+    for (int n = 0; n < c->n_tile_data; n++)
+        dav1d_data_unref_internal(&c->tile[n].data);
+    free(c->tile);
+    for (int n = 0; n < 8; n++) {
+        dav1d_cdf_thread_unref(&c->cdf[n]);
+        if (c->refs[n].p.p.data[0])
+            dav1d_thread_picture_unref(&c->refs[n].p);
+        dav1d_ref_dec(&c->refs[n].refmvs);
+        dav1d_ref_dec(&c->refs[n].segmap);
+    }
+    dav1d_ref_dec(&c->seq_hdr_ref);
+    dav1d_ref_dec(&c->frame_hdr_ref);
+
+    dav1d_ref_dec(&c->mastering_display_ref);
+    dav1d_ref_dec(&c->content_light_ref);
+    dav1d_ref_dec(&c->itut_t35_ref);
+
+    dav1d_freep_aligned(c_out);
+}
+
+void dav1d_picture_unref(Dav1dPicture *const p) {
+    dav1d_picture_unref_internal(p);
+}
+
+uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+    return dav1d_data_create_internal(buf, sz);
+}
+
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
+                    const size_t sz,
+                    void (*const free_callback)(const uint8_t *data,
+                                                void *user_data),
+                    void *const user_data)
+{
+    return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
+}
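+
+/* Illustrative sketch: wrapping a heap buffer so the library frees it when
+ * the last reference is released (free_cb is a caller-defined callback):
+ *
+ *     static void free_cb(const uint8_t *data, void *cookie) {
+ *         free((void *) data);
+ *     }
+ *     ...
+ *     Dav1dData data;
+ *     int res = dav1d_data_wrap(&data, buf, size, free_cb, NULL);
+ */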
+
+int dav1d_data_wrap_user_data(Dav1dData *const buf,
+                              const uint8_t *const user_data,
+                              void (*const free_callback)(const uint8_t *user_data,
+                                                          void *cookie),
+                              void *const cookie)
+{
+    return dav1d_data_wrap_user_data_internal(buf,
+                                              user_data,
+                                              free_callback,
+                                              cookie);
+}
+
+void dav1d_data_unref(Dav1dData *const buf) {
+    dav1d_data_unref_internal(buf);
+}
diff --git a/src/log.c b/src/log.c
new file mode 100644 (file)
index 0000000..de6776a
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+
+#if CONFIG_LOG
+COLD void dav1d_log_default_callback(void *const cookie,
+                                     const char *const format, va_list ap)
+{
+    vfprintf(stderr, format, ap);
+}
+
+COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
+    validate_input(c != NULL);
+
+    if (!c->logger.callback)
+        return;
+
+    va_list ap;
+    va_start(ap, format);
+    c->logger.callback(c->logger.cookie, format, ap);
+    va_end(ap);
+}
+#endif
diff --git a/src/log.h b/src/log.h
new file mode 100644 (file)
index 0000000..df32de7
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOG_H
+#define DAV1D_SRC_LOG_H
+
+#include "config.h"
+
+#include <stdarg.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/attributes.h"
+
+#if CONFIG_LOG
+#define dav1d_log dav1d_log
+void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
+void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3);
+#else
+#define dav1d_log_default_callback NULL
+#define dav1d_log(...) do { } while(0)
+#endif
+
+#endif /* DAV1D_SRC_LOG_H */
diff --git a/src/loopfilter.h b/src/loopfilter.h
new file mode 100644 (file)
index 0000000..c159050
--- /dev/null
+++ b/src/loopfilter.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPFILTER_H
+#define DAV1D_SRC_LOOPFILTER_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+#include "src/lf_mask.h"
+
+#define decl_loopfilter_sb_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
+            const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
+            const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX)
+typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn);
+
+typedef struct Dav1dLoopFilterDSPContext {
+    /*
+     * dimension 1: plane (0=luma, 1=chroma)
+     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by 32
+     */
+    loopfilter_sb_fn loop_filter_sb[2][2];
+} Dav1dLoopFilterDSPContext;
+
+bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
+bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
+bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
+
+#endif /* DAV1D_SRC_LOOPFILTER_H */
diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c
new file mode 100644 (file)
index 0000000..6ea744f
--- /dev/null
+++ b/src/loopfilter_tmpl.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd
+            HIGHBD_DECL_SUFFIX)
+{
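+    /* E, I and H are the AV1 edge, inner-edge and high-edge-variance
+     * thresholds from the filter LUT, specified at 8-bit scale; scale them
+     * (and the flatness threshold F) up to the current bitdepth. */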
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int F = 1 << bitdepth_min_8;
+    E <<= bitdepth_min_8;
+    I <<= bitdepth_min_8;
+    H <<= bitdepth_min_8;
+
+    for (int i = 0; i < 4; i++, dst += stridea) {
+        int p6, p5, p4, p3, p2;
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2, q3, q4, q5, q6;
+        int fm, flat8out, flat8in;
+
+        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+        if (wd > 4) {
+            p2 = dst[strideb * -3];
+            q2 = dst[strideb * +2];
+
+            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+            if (wd > 6) {
+                p3 = dst[strideb * -4];
+                q3 = dst[strideb * +3];
+
+                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+            }
+        }
+        if (!fm) continue;
+
+        if (wd >= 16) {
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+
+            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+        }
+
+        if (wd >= 6)
+            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+        if (wd >= 8)
+            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+        if (wd >= 16 && (flat8out & flat8in)) {
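+            // Wide (wd == 16) filter: each output is a weighted average of
+            // up to 13 neighboring pixels, with weights summing to 16 (>> 4)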
+            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else if (wd == 6 && flat8in) {
+            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+        } else {
+            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
+                                128 * (1 << bitdepth_min_8) - 1)
+
+            if (hev) {
+                int f = iclip_diff(p1 - q1), f1, f2;
+                f = iclip_diff(3 * (q0 - p0) + f);
+
+                f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+                f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+            } else {
+                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+                f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+                f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = iclip_pixel(p1 + f);
+                dst[strideb * +1] = iclip_pixel(q1 - f);
+            }
+#undef iclip_diff
+        }
+    }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int h
+                                   HIGHBD_DECL_SUFFIX)
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx
+                        HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int w
+                                   HIGHBD_DECL_SUFFIX)
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx
+                        HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int h
+                                    HIGHBD_DECL_SUFFIX)
+{
+    const unsigned vm = vmask[0] | vmask[1];
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx
+                        HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int w
+                                    HIGHBD_DECL_SUFFIX)
+{
+    const unsigned vm = vmask[0] | vmask[1];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx
+                        HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_loop_filter_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/src/looprestoration.h b/src/looprestoration.h
new file mode 100644 (file)
index 0000000..539a76b
--- /dev/null
+++ b/src/looprestoration.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPRESTORATION_H
+#define DAV1D_SRC_LOOPRESTORATION_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+enum LrEdgeFlags {
+    LR_HAVE_LEFT = 1 << 0,
+    LR_HAVE_RIGHT = 1 << 1,
+    LR_HAVE_TOP = 1 << 2,
+    LR_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
+// Although the spec applies restoration filters over 4x4 blocks, the wiener
+// filter can be applied to a bigger surface.
+//    * w is constrained by the restoration unit size (w <= 256)
+//    * h is constrained by the stripe height (h <= 64)
+#define decl_wiener_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const_left_pixel_row left, \
+            const pixel *lpf, ptrdiff_t lpf_stride, \
+            int w, int h, const int16_t filterh[7], \
+            const int16_t filterv[7], enum LrEdgeFlags edges \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_wiener_filter_fn(*wienerfilter_fn);
+
+#define decl_selfguided_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const_left_pixel_row left, \
+            const pixel *lpf, ptrdiff_t lpf_stride, \
+            int w, int h, int sgr_idx, const int16_t sgr_w[2], \
+            const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_selfguided_filter_fn(*selfguided_fn);
+
+typedef struct Dav1dLoopRestorationDSPContext {
+    wienerfilter_fn wiener;
+    selfguided_fn selfguided;
+} Dav1dLoopRestorationDSPContext;
+
+bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
+
+#endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c
new file mode 100644 (file)
index 0000000..a8a2472
--- /dev/null
+++ b/src/looprestoration_tmpl.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+                    const pixel (*left)[4],
+                    const pixel *lpf, const ptrdiff_t lpf_stride,
+                    int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
+{
+    const int have_left = !!(edges & LR_HAVE_LEFT);
+    const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+    // Copy more pixels if we don't have to pad them
+    unit_w += 3 * have_left + 3 * have_right;
+    pixel *dst_l = dst + 3 * !have_left;
+    p -= 3 * have_left;
+    lpf -= 3 * have_left;
+
+    if (edges & LR_HAVE_TOP) {
+        // Copy previous loop filtered rows
+        const pixel *const above_1 = lpf;
+        const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_l, above_1, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+    } else {
+        // Pad with first row
+        pixel_copy(dst_l, p, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) {
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
+    }
+
+    pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+    if (edges & LR_HAVE_BOTTOM) {
+        // Copy next loop filtered rows
+        const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
+        const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+    } else {
+        // Pad with last row
+        const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) {
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
+    }
+
+    // Inner UNIT_WxSTRIPE_H
+    for (int j = 0; j < stripe_h; j++) {
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+        dst_tl += REST_UNIT_STRIDE;
+        p += PXSTRIDE(p_stride);
+    }
+
+    if (!have_right) {
+        pixel *pad = dst_l + unit_w;
+        pixel *row_last = &dst_l[unit_w - 1];
+        // Pad 3x(STRIPE_H+6) with last column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(pad, *row_last, 3);
+            pad += REST_UNIT_STRIDE;
+            row_last += REST_UNIT_STRIDE;
+        }
+    }
+
+    if (!have_left) {
+        // Pad 3x(STRIPE_H+6) with first column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(dst, *dst_l, 3);
+            dst += REST_UNIT_STRIDE;
+            dst_l += REST_UNIT_STRIDE;
+        }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE;
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3);
+            dst += REST_UNIT_STRIDE;
+        }
+    }
+}
+
+// FIXME Could split into luma and chroma specific functions
+// (since the first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+                     const pixel (*const left)[4],
+                     const pixel *lpf, const ptrdiff_t lpf_stride,
+                     const int w, const int h,
+                     const int16_t filterh[7], const int16_t filterv[7],
+                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    pixel *tmp_ptr = tmp;
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+    // Values stored between horizontal and vertical filtering don't
+    // fit in a uint8_t.
+    uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    uint16_t *hor_ptr = hor;
+
+    const int bitdepth = bitdepth_from_max(bitdepth_max);
+    const int round_bits_h = 3 + (bitdepth == 12) * 2;
+    const int rounding_off_h = 1 << (round_bits_h - 1);
+    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
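+    // The 7-tap filters are stored with 128 subtracted from the center tap,
+    // so the center pixel scaled by 128 (<< 7) is added back separately,
+    // together with a bias that keeps the intermediate sums non-negative.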
+    for (int j = 0; j < h + 6; j++) {
+        for (int i = 0; i < w; i++) {
+            int sum = (tmp_ptr[i + 3] << 7) + (1 << (bitdepth + 6));
+
+            for (int k = 0; k < 7; k++) {
+                sum += tmp_ptr[i + k] * filterh[k];
+            }
+
+            hor_ptr[i] =
+                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+        }
+        tmp_ptr += REST_UNIT_STRIDE;
+        hor_ptr += REST_UNIT_STRIDE;
+    }
+
+    const int round_bits_v = 11 - (bitdepth == 12) * 2;
+    const int rounding_off_v = 1 << (round_bits_v - 1);
+    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
+    for (int j = 0; j < h; j++) {
+        for (int i = 0; i < w; i++) {
+            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
+
+            for (int k = 0; k < 7; k++) {
+                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+            }
+
+            p[j * PXSTRIDE(p_stride) + i] =
+                iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+        }
+    }
+}
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self-guided filter only needs 1
+// pixel above and 1 pixel to the left. The pixels below and to the right
+// must still be included in the sums, but they don't need to be stored.
+//
+// Example for a 4x4 block:
+//      x x x x x x x x x x
+//      x c c c c c c c c x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x c c c c c c c c x
+//      x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // We skip the first and last columns, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0], b = s[REST_UNIT_STRIDE];
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c;
+            a = b;
+            b = c;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    // We skip the last 2 rows as they are never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2];
+
+        // We don't store the first column as it is never read and
+        // we don't store the last 2 columns as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c;
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self-guided filter only needs 1
+// pixel above and 1 pixel to the left. The pixels below and to the right
+// must still be included in the sums, but they don't need to be stored.
+//
+// Example for a 4x4 block:
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE];
+        int d = s[0];
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = *s;
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum3 function comments for details on row and column skipping
+static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // We skip the first and last columns, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        int32_t *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0] * s[0];
+        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c;
+            a = b;
+            b = c;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    // We skip the last 2 rows as they are never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2];
+
+        // We don't store the first column as it is never read and
+        // we don't store the last 2 columns as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c;
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum5 function comments for details on row and column skipping
+static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
+                       const int h)
+{
+    // We skip the first row, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        int32_t *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
+        int d = s[0] * s[0];
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = s[0] * s[0];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+static void selfguided_filter(coef *dst, const pixel *src,
+                              const ptrdiff_t src_stride, const int w,
+                              const int h, const int n, const int s
+                              HIGHBD_DECL_SUFFIX)
+{
+    const int sgr_one_by_x = n == 25 ? 164 : 455;
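+    // 1/n in Q12 precision: 4096 / 25 ~= 164 and 4096 / 9 ~= 455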
+
+    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
+    // By inverting A and B after the boxsums, B can be of size coef instead
+    // of int32_t
+    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+
+    const int step = (n == 25) + 1;
+    if (n == 25) {
+        boxsum5(B_, src, w + 6, h + 6);
+        boxsum5sqr(A_, src, w + 6, h + 6);
+    } else {
+        boxsum3(B_, src, w + 6, h + 6);
+        boxsum3sqr(A_, src, w + 6, h + 6);
+    }
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+    int32_t *AA = A - REST_UNIT_STRIDE;
+    coef *BB = B - REST_UNIT_STRIDE;
+    for (int j = -1; j < h + 1; j += step) {
+        for (int i = -1; i < w + 1; i++) {
+            const int a =
+                (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
+            const int b =
+                (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
+
+            const unsigned p = imax(a * n - b * b, 0);
+            const unsigned z = (p * s + (1 << 19)) >> 20;
+            const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
+
+            // This is where we invert A and B, so that B is of size coef.
+            AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+            BB[i] = 256 - x;
+        }
+        AA += step * REST_UNIT_STRIDE;
+        BB += step * REST_UNIT_STRIDE;
+    }
+
+    src += 3 * REST_UNIT_STRIDE + 3;
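+    // For the 5x5 box (n == 25), A and B were only computed on every other
+    // row (step == 2), so output rows alternate between a 6:5-weighted
+    // vertical mix of the A/B rows above and below (SIX_NEIGHBORS) and a
+    // 6:5-weighted horizontal mix within the current A/B row.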
+    if (n == 25) {
+        int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+    ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+        for (; j < h - 1; j += 2) {
+            for (int i = 0; i < w; i++) {
+                const int a = SIX_NEIGHBORS(B, i);
+                const int b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+            for (int i = 0; i < w; i++) {
+                const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+                const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+                dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+        if (j + 1 == h) { // Last row, when number of rows is odd
+            for (int i = 0; i < w; i++) {
+                const int a = SIX_NEIGHBORS(B, i);
+                const int b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+        }
+#undef SIX_NEIGHBORS
+    } else {
+#define EIGHT_NEIGHBORS(P, i)\
+    ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int a = EIGHT_NEIGHBORS(B, i);
+                const int b = EIGHT_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+    }
+#undef EIGHT_NEIGHBORS
+}
+
+static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+                         const pixel (*const left)[4],
+                         const pixel *lpf, const ptrdiff_t lpf_stride,
+                         const int w, const int h, const int sgr_idx,
+                         const int16_t sgr_w[2], const enum LrEdgeFlags edges
+                         HIGHBD_DECL_SUFFIX)
+{
+    // The selfguided filter is applied to a maximum stripe height of 64
+    // rows, plus 3 rows of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+    // Selfguided filter outputs to a maximum stripe height of 64 and a
+    // maximum restoration width of 384 (256 * 1.5)
+    coef dst[64 * 384];
+
+    // r0 and r1 cannot both be zero
+    if (!dav1d_sgr_params[sgr_idx][0]) {
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
+        const int w1 = (1 << 7) - sgr_w[1];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else if (!dav1d_sgr_params[sgr_idx][1]) {
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
+        const int w0 = sgr_w[0];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else {
+        coef dst1[64 * 384];
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        const int w0 = sgr_w[0];
+        const int w1 = (1 << 7) - w0 - sgr_w[1];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
+        selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
+                              w1 * (dst1[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    }
+}
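+
+// Worked example for the blends above (illustrative numbers, 8 bpc): with
+// p[i] = 100, a filtered value dst[] = 1640 (i.e. 102.5 in 4-bit fixed
+// point) and w1 = 64 (half of the 7-bit unity weight 128), u = 1600,
+// v = (1600 << 7) + 64 * (1640 - 1600) = 207360, and (v + 1024) >> 11 = 101,
+// i.e. the output moves roughly halfway towards the filtered value.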
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+    c->wiener = wiener_c;
+    c->selfguided = selfguided_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
+#elif ARCH_PPC64LE
+    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
+#elif ARCH_X86
+    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
+#endif
+}
diff --git a/src/lr_apply.h b/src/lr_apply.h
new file mode 100644 (file)
index 0000000..638bb8b
--- /dev/null
+++ b/src/lr_apply.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LR_APPLY_H
+#define DAV1D_SRC_LR_APPLY_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
+                               /*const*/ pixel *const src[3], int sby);
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+                            int sby);
+
+#endif /* DAV1D_SRC_LR_APPLY_H */
diff --git a/src/lr_apply_tmpl.c b/src/lr_apply_tmpl.c
new file mode 100644 (file)
index 0000000..02413b9
--- /dev/null
+++ b/src/lr_apply_tmpl.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+enum LrRestorePlanes {
+    LR_RESTORE_Y = 1 << 0,
+    LR_RESTORE_U = 1 << 1,
+    LR_RESTORE_V = 1 << 2,
+};
+
+// The loop filter buffer stores 12 rows of pixels. A superblock will
+// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
+// and 2 below); the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next superblock row.
+static void backup_lpf(const Dav1dFrameContext *const f,
+                       pixel *dst, const ptrdiff_t dst_stride,
+                       const pixel *src, const ptrdiff_t src_stride,
+                       const int ss_ver, const int sb128,
+                       int row, const int row_h, const int src_w,
+                       const int h, const int ss_hor)
+{
+    const int dst_w = f->frame_hdr->super_res.enabled ?
+                      (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = (64 - 8 * !row) >> ss_ver;
+
+    if (row) {
+        const int top = 4 << sb128;
+        // Copy the top part of the stored loop filtered pixels from the
+        // previous sb row needed above the first stripe of this sb row.
+        pixel_copy(&dst[PXSTRIDE(dst_stride) *  0],
+                   &dst[PXSTRIDE(dst_stride) *  top],      dst_w);
+        pixel_copy(&dst[PXSTRIDE(dst_stride) *  1],
+                   &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
+        pixel_copy(&dst[PXSTRIDE(dst_stride) *  2],
+                   &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
+        pixel_copy(&dst[PXSTRIDE(dst_stride) *  3],
+                   &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
+    }
+
+    dst += 4 * PXSTRIDE(dst_stride);
+    src += (stripe_h - 2) * PXSTRIDE(src_stride);
+
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+        while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
+            f->dsp->mc.resize(dst, dst_stride, src, src_stride,
+                              dst_w, n_lines, src_w, f->resize_step[ss_hor],
+                              f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
+            row += stripe_h; // unmodified stripe_h for the 1st stripe
+            stripe_h = 64 >> ss_ver;
+            src += stripe_h * PXSTRIDE(src_stride);
+            dst += n_lines * PXSTRIDE(dst_stride);
+            if (n_lines == 3) {
+                pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
+                dst += PXSTRIDE(dst_stride);
+            }
+        }
+    } else {
+        while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
+            for (int i = 0; i < 4; i++) {
+                pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
+                                               src, src_w);
+                dst += PXSTRIDE(dst_stride);
+                src += PXSTRIDE(src_stride);
+            }
+            row += stripe_h; // unmodified stripe_h for the 1st stripe
+            stripe_h = 64 >> ss_ver;
+            src += (stripe_h - 4) * PXSTRIDE(src_stride);
+        }
+    }
+}
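+
+// Each stripe contributes 4 backed-up rows. When the 4th source row would be
+// the first row past the frame bottom (row + stripe_h + 1 == h), only 3 real
+// rows are read (n_lines == 3) and the 4th is duplicated from the row above:
+// explicitly in the resize path, and via the "i == n_lines" source switch in
+// the plain copy path.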
+
+void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
+                               /*const*/ pixel *const src[3], const int sby)
+{
+    const int offset = 8 * !!sby;
+    const ptrdiff_t *const src_stride = f->cur.stride;
+    const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
+
+    // TODO Also check block level restore type to reduce copying.
+    const int restore_planes = f->lf.restore_planes;
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->cur.p.h;
+        const int w = f->bw << 2;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
+        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
+        backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
+                   src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+                   0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+        const int w = f->bw << (2 - ss_hor);
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
+        const int offset_uv = offset >> ss_ver;
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+
+        if (restore_planes & LR_RESTORE_U) {
+            backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
+                       src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
+        }
+        if (restore_planes & LR_RESTORE_V) {
+            backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
+                       src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
+        }
+    }
+}
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+                      const pixel (*left)[4], int x, int y,
+                      const int plane, const int unit_w, const int row_h,
+                      const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
+    const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
+    const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
+    const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+    // FIXME a size of [8] might be easier for SIMD
+    int16_t filterh[7], filterv[7];
+    if (lr->type == DAV1D_RESTORATION_WIENER) {
+        filterh[0] = filterh[6] = lr->filter_h[0];
+        filterh[1] = filterh[5] = lr->filter_h[1];
+        filterh[2] = filterh[4] = lr->filter_h[2];
+        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
+
+        filterv[0] = filterv[6] = lr->filter_v[0];
+        filterv[1] = filterv[5] = lr->filter_v[1];
+        filterv[2] = filterv[4] = lr->filter_v[2];
+        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+    }
+
+    while (y + stripe_h <= row_h) {
+        // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
+        edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
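+        // (Branchless form of: if (y + stripe_h != row_h) edges |= LR_HAVE_BOTTOM;
+        //                      else edges &= ~LR_HAVE_BOTTOM;)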
+        if (lr->type == DAV1D_RESTORATION_WIENER) {
+            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                           filterh, filterv, edges HIGHBD_CALL_SUFFIX);
+        } else {
+            assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
+            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                               lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
+        }
+
+        left += stripe_h;
+        y += stripe_h;
+        if (y + stripe_h > row_h && sbrow_has_bottom) break;
+        p += stripe_h * PXSTRIDE(p_stride);
+        edges |= LR_HAVE_TOP;
+        stripe_h = imin(64 >> ss_ver, row_h - y);
+        if (stripe_h == 0) break;
+        lpf += 4 * PXSTRIDE(lpf_stride);
+    }
+}
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+                      int u)
+{
+    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+        pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+                     const int w, const int h, const int row_h, const int plane)
+{
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+    const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
+
+    const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane];
+    const int unit_size = 1 << unit_size_log2;
+    const int half_unit_size = unit_size >> 1;
+    const int max_unit_size = unit_size + half_unit_size;
+
+    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+    const int row_y = y + ((8 >> ss_ver) * !!y);
+
+    // FIXME This is an ugly hack to look up the proper AV1Filter unit for
+    // chroma planes. Question: for multithreaded decoding, is it better
+    // to store the chroma LR information with the collocated luma
+    // information? In other words, for a chroma restoration unit located at
+    // (128, 128) with 4:2:0 chroma subsampling, do we store the filter
+    // information at the AV1Filter unit located at (128, 128) or (256, 256)?
+    // TODO Support chroma subsampling.
+    const int shift_hor = 7 - ss_hor;
+
+    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128, plus 8 rows of offset */][4];
+    const Av1RestorationUnit *lr[2];
+
+    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
+                             (row_h < h ? LR_HAVE_BOTTOM : 0);
+
+    int aligned_unit_pos = row_y & ~(unit_size - 1);
+    if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
+        aligned_unit_pos -= unit_size;
+    aligned_unit_pos <<= ss_ver;
+    const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
+    const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
+    lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
+    int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
+    int x = 0, bit = 0;
+    for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
+        const int next_x = x + unit_size;
+        const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
+        lr[!bit] =
+            &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
+        const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
+        if (restore_next)
+            backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
+        if (restore)
+            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
+                      lr[bit], edges);
+        x = next_x;
+        restore = restore_next;
+    }
+    if (restore) {
+        edges &= ~LR_HAVE_RIGHT;
+        const int unit_w = w - x;
+        lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
+    }
+}
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+                            const int sby)
+{
+    const int offset_y = 8 * !!sby;
+    const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
+    const int restore_planes = f->lf.restore_planes;
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->sr_cur.p.p.h;
+        const int w = f->sr_cur.p.p.w;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
+        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
+        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+                 h, row_h, 0);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
+        const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
+        const int offset_uv = offset_y >> ss_ver;
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+        if (restore_planes & LR_RESTORE_U)
+            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+                     w, h, row_h, 1);
+
+        if (restore_planes & LR_RESTORE_V)
+            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+                     w, h, row_h, 2);
+    }
+}
diff --git a/src/mc.h b/src/mc.h
new file mode 100644 (file)
index 0000000..784b58d
--- /dev/null
+++ b/src/mc.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MC_H
+#define DAV1D_SRC_MC_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_mc_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mc_fn(*mc_fn);
+
+#define decl_mc_scaled_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mc_scaled_fn(*mc_scaled_fn);
+
+#define decl_warp8x8_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8_fn(*warp8x8_fn);
+
+#define decl_mct_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mct_fn(*mct_fn);
+
+#define decl_mct_scaled_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mct_scaled_fn(*mct_scaled_fn);
+
+#define decl_warp8x8t_fn(name) \
+void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8t_fn(*warp8x8t_fn);
+
+#define decl_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const int16_t *tmp1, const int16_t *tmp2, int w, int h \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_avg_fn(*avg_fn);
+
+#define decl_w_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight \
+            HIGHBD_DECL_SUFFIX)
+typedef decl_w_avg_fn(*w_avg_fn);
+
+#define decl_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+            const uint8_t *mask HIGHBD_DECL_SUFFIX)
+typedef decl_mask_fn(*mask_fn);
+
+#define decl_w_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+            uint8_t *mask, int sign HIGHBD_DECL_SUFFIX)
+typedef decl_w_mask_fn(*w_mask_fn);
+
+#define decl_blend_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+            int w, int h, const uint8_t *mask)
+typedef decl_blend_fn(*blend_fn);
+
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn);
+
+#define decl_emu_edge_fn(name) \
+void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
+            pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
+typedef decl_emu_edge_fn(*emu_edge_fn);
+
+#define decl_resize_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX)
+typedef decl_resize_fn(*resize_fn);
+
+typedef struct Dav1dMCDSPContext {
+    mc_fn mc[N_2D_FILTERS];
+    mc_scaled_fn mc_scaled[N_2D_FILTERS];
+    mct_fn mct[N_2D_FILTERS];
+    mct_scaled_fn mct_scaled[N_2D_FILTERS];
+    avg_fn avg;
+    w_avg_fn w_avg;
+    mask_fn mask;
+    w_mask_fn w_mask[3 /* 444, 422, 420 */];
+    blend_fn blend;
+    blend_dir_fn blend_v;
+    blend_dir_fn blend_h;
+    warp8x8_fn warp8x8;
+    warp8x8t_fn warp8x8t;
+    emu_edge_fn emu_edge;
+    resize_fn resize;
+} Dav1dMCDSPContext;
+
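+// Usage sketch (hypothetical caller, assuming bitdepth_max is in scope in
+// templated code): after bitfn(dav1d_mc_dsp_init)(&c) fills the table, an
+// 8-tap regular put is dispatched as
+//     c.mc[FILTER_2D_8TAP_REGULAR](dst, dst_stride, src, src_stride,
+//                                  w, h, mx, my HIGHBD_TAIL_SUFFIX);
+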
+bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c);
+bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c);
+bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c);
+
+#endif /* DAV1D_SRC_MC_H */
diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c
new file mode 100644 (file)
index 0000000..c4d9e14
--- /dev/null
+++ b/src/mc_tmpl.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+#if BITDEPTH == 8
+#define get_intermediate_bits(bitdepth_max) 4
+// Output in interval [-5132, 9212], fits in int16_t as is
+#define PREP_BIAS 0
+#else
+// 4 for 10 bits/component, 2 for 12 bits/component
+#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
+// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
+// Subtract a bias to ensure the output fits in int16_t
+#define PREP_BIAS 8192
+#endif
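+// Concretely: subtracting PREP_BIAS maps the 12-bit worst case
+// [-20602, 36983] to [-28794, 28791] and the 10-bit [-20588, 36956] to
+// [-28780, 28764], both within int16_t.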
+
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{
+    do {
+        pixel_copy(dst, src, w);
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+static NOINLINE void
+prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
+       const int w, int h HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    do {
+        for (int x = 0; x < w; x++)
+            tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
+
+        tmp += w;
+        src += src_stride;
+    } while (--h);
+}
+
+#define FILTER_8TAP(src, x, F, stride) \
+    (F[0] * src[x + -3 * stride] + \
+     F[1] * src[x + -2 * stride] + \
+     F[2] * src[x + -1 * stride] + \
+     F[3] * src[x + +0 * stride] + \
+     F[4] * src[x + +1 * stride] + \
+     F[5] * src[x + +2 * stride] + \
+     F[6] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+
+#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
+    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
+    iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
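+
+// The _RND form adds half of (1 << sh) before shifting, i.e. it rounds to
+// nearest instead of truncating.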
+
+#define GET_H_FILTER(mx) \
+    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
+        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]
+
+#define GET_V_FILTER(my) \
+    const int8_t *const fv = !(my) ? NULL : h > 4 ? \
+        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]
+
+#define GET_FILTERS() \
+    GET_H_FILTER(mx); \
+    GET_V_FILTER(my)
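+
+// filter_type packs the horizontal filter in bits 0-1 and the vertical
+// filter in bits 2-3 (see the filter_fns() expansions below). For w or h of
+// 4 or less, rows 3 and 4 of dav1d_mc_subpel_filters (the 4-tap variants)
+// are used instead; assuming the regular/smooth/sharp = 0/1/2 ordering of
+// the public headers, (filter_type & 1) makes sharp fall back to the
+// regular 4-tap filter.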
+
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+           const pixel *src, ptrdiff_t src_stride,
+           const int w, int h, const int mx, const int my,
+           const int filter_type HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+
+    GET_FILTERS();
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7;
+            int16_t mid[128 * 135], *mid_ptr = mid;
+
+            src -= src_stride * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+                                                       6 - intermediate_bits);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+                                                    6 + intermediate_bits);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+                                                         6 - intermediate_bits);
+                    dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
+                }
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static NOINLINE void
+put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const pixel *src, ptrdiff_t src_stride,
+                  const int w, int h, const int mx, int my,
+                  const int dx, const int dy, const int filter_type
+                  HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+    int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+    src_stride = PXSTRIDE(src_stride);
+
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            GET_H_FILTER(imx >> 6);
+            mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+                                                    6 - intermediate_bits) :
+                              src[ioff] << intermediate_bits;
+            imx += dx;
+            ioff += imx >> 10;
+            imx &= 0x3ff;
+        }
+
+        mid_ptr += 128;
+        src += src_stride;
+    } while (--tmp_h);
+
+    mid_ptr = mid + 128 * 3;
+    for (int y = 0; y < h; y++) {
+        int x;
+        GET_V_FILTER(my >> 6);
+
+        for (x = 0; x < w; x++)
+            dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+                                                 6 + intermediate_bits) :
+                          iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
+                                              intermediate_bits);
+
+        my += dy;
+        mid_ptr += (my >> 10) * 128;
+        my &= 0x3ff;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static NOINLINE void
+prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+            const int w, int h, const int mx, const int my,
+            const int filter_type HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    GET_FILTERS();
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7;
+            int16_t mid[128 * 135], *mid_ptr = mid;
+
+            src -= src_stride * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+                                                       6 - intermediate_bits);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3;
+            do {
+                for (int x = 0; x < w; x++) {
+                    int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
+                                  PREP_BIAS;
+                    assert(t >= INT16_MIN && t <= INT16_MAX);
+                    tmp[x] = t;
+                }
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+                                                   6 - intermediate_bits) -
+                             PREP_BIAS;
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
+                                               6 - intermediate_bits) -
+                         PREP_BIAS;
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static NOINLINE void
+prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+                   const int w, int h, const int mx, int my,
+                   const int dx, const int dy, const int filter_type
+                   HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+    int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+    src_stride = PXSTRIDE(src_stride);
+
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            GET_H_FILTER(imx >> 6);
+            mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+                                                    6 - intermediate_bits) :
+                              src[ioff] << intermediate_bits;
+            imx += dx;
+            ioff += imx >> 10;
+            imx &= 0x3ff;
+        }
+
+        mid_ptr += 128;
+        src += src_stride;
+    } while (--tmp_h);
+
+    mid_ptr = mid + 128 * 3;
+    for (int y = 0; y < h; y++) {
+        int x;
+        GET_V_FILTER(my >> 6);
+
+        for (x = 0; x < w; x++)
+            tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
+                         : mid_ptr[x]) - PREP_BIAS;
+
+        my += dy;
+        mid_ptr += (my >> 10) * 128;
+        my &= 0x3ff;
+        tmp += w;
+    }
+}
+
+#define filter_fns(type, type_h, type_v) \
+static void put_8tap_##type##_c(pixel *const dst, \
+                                const ptrdiff_t dst_stride, \
+                                const pixel *const src, \
+                                const ptrdiff_t src_stride, \
+                                const int w, const int h, \
+                                const int mx, const int my \
+                                HIGHBD_DECL_SUFFIX) \
+{ \
+    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+               type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void put_8tap_##type##_scaled_c(pixel *const dst, \
+                                       const ptrdiff_t dst_stride, \
+                                       const pixel *const src, \
+                                       const ptrdiff_t src_stride, \
+                                       const int w, const int h, \
+                                       const int mx, const int my, \
+                                       const int dx, const int dy \
+                                       HIGHBD_DECL_SUFFIX) \
+{ \
+    put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                      type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_c(int16_t *const tmp, \
+                                 const pixel *const src, \
+                                 const ptrdiff_t src_stride, \
+                                 const int w, const int h, \
+                                 const int mx, const int my \
+                                 HIGHBD_DECL_SUFFIX) \
+{ \
+    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+                type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
+                                        const pixel *const src, \
+                                        const ptrdiff_t src_stride, \
+                                        const int w, const int h, \
+                                        const int mx, const int my, \
+                                        const int dx, const int dy \
+                                        HIGHBD_DECL_SUFFIX) \
+{ \
+    prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
+                       type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+}
+
+filter_fns(regular,        DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(regular_sharp,  DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth,         DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp,   DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp,          DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp_regular,  DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth,   DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SMOOTH)
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
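+
+// FILTER_BILIN is linear interpolation scaled by 16:
+// 16 * a + mxy * (b - a) == (16 - mxy) * a + mxy * b.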
+
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
+
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+                        const pixel *src, ptrdiff_t src_stride,
+                        const int w, int h, const int mx, const int my
+                        HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            int16_t mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+                                                  4 - intermediate_bits);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
+                                               4 + intermediate_bits);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = FILTER_BILIN_RND(src, x, mx, 1,
+                                                    4 - intermediate_bits);
+                    dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
+                }
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
+                               const pixel *src, ptrdiff_t src_stride,
+                               const int w, int h, const int mx, int my,
+                               const int dx, const int dy
+                               HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+    int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+                                          4 - intermediate_bits);
+            imx += dx;
+            ioff += imx >> 10;
+            imx &= 0x3ff;
+        }
+
+        mid_ptr += 128;
+        src += PXSTRIDE(src_stride);
+    } while (--tmp_h);
+
+    mid_ptr = mid;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128,
+                                       4 + intermediate_bits);
+
+        my += dy;
+        mid_ptr += (my >> 10) * 128;
+        my &= 0x3ff;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void prep_bilin_c(int16_t *tmp,
+                         const pixel *src, ptrdiff_t src_stride,
+                         const int w, int h, const int mx, const int my
+                         HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            int16_t mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+                                                  4 - intermediate_bits);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) -
+                             PREP_BIAS;
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
+                                              4 - intermediate_bits) -
+                             PREP_BIAS;
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
+                                          4 - intermediate_bits) - PREP_BIAS;
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static void prep_bilin_scaled_c(int16_t *tmp,
+                                const pixel *src, ptrdiff_t src_stride,
+                                const int w, int h, const int mx, int my,
+                                const int dx, const int dy HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+    int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+                                          4 - intermediate_bits);
+            imx += dx;
+            ioff += imx >> 10;
+            imx &= 0x3ff;
+        }
+
+        mid_ptr += 128;
+        src += PXSTRIDE(src_stride);
+    } while (--tmp_h);
+
+    mid_ptr = mid;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS;
+
+        my += dy;
+        mid_ptr += (my >> 10) * 128;
+        my &= 0x3ff;
+        tmp += w;
+    } while (--h);
+}
+
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const int16_t *tmp1, const int16_t *tmp2, const int w, int h
+                  HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int sh = intermediate_bits + 1;
+    const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+                    const int weight HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int sh = intermediate_bits + 4;
+    const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * weight +
+                                  tmp2[x] * (16 - weight) + rnd) >> sh);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int sh = intermediate_bits + 6;
+    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
+                                  tmp2[x] * (64 - mask[x]) + rnd) >> sh);
+
+        tmp1 += w;
+        tmp2 += w;
+        mask += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
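+// blend_px() is a 6-bit blend with round-to-nearest: m == 0 yields a
+// unchanged and m == 64 yields b exactly.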
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                    const int w, int h, const uint8_t *mask)
+{
+    do {
+        for (int x = 0; x < w; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+        mask += w;
+    } while (--h);
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, int h)
+{
+    const uint8_t *const mask = &dav1d_obmc_masks[w];
+    do {
+        for (int x = 0; x < (w * 3) >> 2; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+    } while (--h);
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, int h)
+{
+    const uint8_t *mask = &dav1d_obmc_masks[h];
+    h = (h * 3) >> 2;
+    do {
+        const int m = *mask++;
+        for (int x = 0; x < w; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], m);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+    } while (--h);
+}
+
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+                     uint8_t *mask, const int sign,
+                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+{
+    // The 4:2:0 mask is stored at 2x2 resolution: the 2x1 sum is stored for
+    // even rows, then loaded back on odd rows to compute the final value.
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    const int bitdepth = bitdepth_from_max(bitdepth_max);
+    const int sh = intermediate_bits + 6;
+    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+    const int mask_sh = bitdepth + intermediate_bits - 4;
+    const int mask_rnd = 1 << (mask_sh - 5);
+    do {
+        for (int x = 0; x < w; x++) {
+            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+            dst[x] = iclip_pixel((tmp1[x] * m +
+                                  tmp2[x] * (64 - m) + rnd) >> sh);
+
+            if (ss_hor) {
+                x++;
+
+                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+                dst[x] = iclip_pixel((tmp1[x] * n +
+                                      tmp2[x] * (64 - n) + rnd) >> sh);
+
+                if (h & ss_ver) {
+                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
+                } else if (ss_ver) {
+                    mask[x >> 1] = m + n;
+                } else {
+                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
+                }
+            } else {
+                mask[x] = m;
+            }
+        }
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
+    } while (--h);
+}
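+
+// For 4:2:2 the two horizontal mask values are averaged as
+// (m + n + 1 - sign) >> 1; for 4:2:0 the 2x1 sum is parked in mask[] on the
+// first row of each pair and folded into the 2x2 average
+// (m + n + stored + 2 - sign) >> 2 on the second, which is what the
+// "h & ss_ver" test selects (h counts down, so its parity flips each row).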
+
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+                             const int16_t *const tmp1, const int16_t *const tmp2, \
+                             const int w, const int h, uint8_t *mask, \
+                             const int sign HIGHBD_DECL_SUFFIX) \
+{ \
+    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
+             HIGHBD_TAIL_SUFFIX); \
+}
+
+w_mask_fns(444, 0, 0);
+w_mask_fns(422, 1, 0);
+w_mask_fns(420, 1, 1);
+
+#undef w_mask_fns
+
+#if ARCH_X86
+#define FILTER_WARP(src, x, F, stride) \
+    (F[0] * src[x + -3 * stride] + \
+     F[4] * src[x + -2 * stride] + \
+     F[1] * src[x + -1 * stride] + \
+     F[5] * src[x + +0 * stride] + \
+     F[2] * src[x + +1 * stride] + \
+     F[6] * src[x + +2 * stride] + \
+     F[3] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+#else
+#define FILTER_WARP(src, x, F, stride) \
+    (F[0] * src[x + -3 * stride] + \
+     F[1] * src[x + -2 * stride] + \
+     F[2] * src[x + -1 * stride] + \
+     F[3] * src[x + +0 * stride] + \
+     F[4] * src[x + +1 * stride] + \
+     F[5] * src[x + +2 * stride] + \
+     F[6] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+#endif
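+
+// The two variants above index the filter taps differently: on x86 the
+// dav1d_mc_warp_filter table stores the even taps in entries 0-3 and the
+// odd taps in entries 4-7 (presumably to match the SIMD code's layout), so
+// the C fallback permutes its accesses to compensate.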
+
+#define FILTER_WARP_RND(src, x, F, stride, sh) \
+    ((FILTER_WARP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
+    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
+
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const int16_t *const abcd, int mx, int my
+                              HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    int16_t mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+                                         7 - intermediate_bits);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8,
+                                      7 + intermediate_bits);
+        }
+        mid_ptr += 8;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
+                               const pixel *src, const ptrdiff_t src_stride,
+                               const int16_t *const abcd, int mx, int my
+                               HIGHBD_DECL_SUFFIX)
+{
+    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+    int16_t mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+                                         7 - intermediate_bits);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
+        }
+        mid_ptr += 8;
+        tmp += tmp_stride;
+    }
+}
+
+static void emu_edge_c(const intptr_t bw, const intptr_t bh,
+                       const intptr_t iw, const intptr_t ih,
+                       const intptr_t x, const intptr_t y,
+                       pixel *dst, const ptrdiff_t dst_stride,
+                       const pixel *ref, const ptrdiff_t ref_stride)
+{
+    // find offset in reference of visible block to copy
+    ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) +
+           iclip((int) x, 0, (int) iw - 1);
+
+    // number of pixels to extend (left, right, top, bottom)
+    const int left_ext = iclip((int) -x, 0, (int) bw - 1);
+    const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
+    assert(left_ext + right_ext < bw);
+    const int top_ext = iclip((int) -y, 0, (int) bh - 1);
+    const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
+    assert(top_ext + bottom_ext < bh);
+
+    // copy visible portion first
+    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
+    const int center_w = (int) (bw - left_ext - right_ext);
+    const int center_h = (int) (bh - top_ext - bottom_ext);
+    for (int y = 0; y < center_h; y++) {
+        pixel_copy(blk + left_ext, ref, center_w);
+        // extend left edge for this line
+        if (left_ext)
+            pixel_set(blk, blk[left_ext], left_ext);
+        // extend right edge for this line
+        if (right_ext)
+            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
+                      right_ext);
+        ref += PXSTRIDE(ref_stride);
+        blk += PXSTRIDE(dst_stride);
+    }
+
+    // copy top
+    blk = dst + top_ext * PXSTRIDE(dst_stride);
+    for (int y = 0; y < top_ext; y++) {
+        pixel_copy(dst, blk, bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+
+    // copy bottom
+    dst += center_h * PXSTRIDE(dst_stride);
+    for (int y = 0; y < bottom_ext; y++) {
+        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
+                     const pixel *src, const ptrdiff_t src_stride,
+                     const int dst_w, int h, const int src_w,
+                     const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
+{
+    do {
+        int mx = mx0, src_x = -1;
+        for (int x = 0; x < dst_w; x++) {
+            const int8_t *const F = dav1d_resize_filter[mx >> 8];
+            dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
+                                    F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
+                                    F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
+                                    F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
+                                    F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
+                                    F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
+                                    F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
+                                    F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) +
+                                  64) >> 7);
+            mx += dx;
+            src_x += mx >> 14;
+            mx &= 0x3fff;
+        }
+
+        dst += PXSTRIDE(dst_stride);
+        src += PXSTRIDE(src_stride);
+    } while (--h);
+}
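+
+/* Worked example (editor's note): dx and mx are Q14 steps, so a 2:1
+ * downscale uses dx = 2 << 14 and each output pixel advances src_x by
+ * mx >> 14 = 2 before keeping only the fraction (mx &= 0x3fff).
+ * dav1d_resize_filter[mx >> 8] selects one of 64 8-tap phases; the taps
+ * appear to be stored negated, hence the leading minus in the sum. */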
+
+COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(type, name) do { \
+    c->mc        [type] = put_##name##_c; \
+    c->mc_scaled [type] = put_##name##_scaled_c; \
+    c->mct       [type] = prep_##name##_c; \
+    c->mct_scaled[type] = prep_##name##_scaled_c; \
+} while (0)
+
+    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
+    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
+
+    c->avg      = avg_c;
+    c->w_avg    = w_avg_c;
+    c->mask     = mask_c;
+    c->blend    = blend_c;
+    c->blend_v  = blend_v_c;
+    c->blend_h  = blend_h_c;
+    c->w_mask[0] = w_mask_444_c;
+    c->w_mask[1] = w_mask_422_c;
+    c->w_mask[2] = w_mask_420_c;
+    c->warp8x8  = warp_affine_8x8_c;
+    c->warp8x8t = warp_affine_8x8t_c;
+    c->emu_edge = emu_edge_c;
+    c->resize   = resize_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
+#endif
+}
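+
+/* Usage sketch (editor's note, not upstream code): bitfn() expands the name
+ * per bitdepth, so a caller does roughly
+ *
+ *     Dav1dMCDSPContext c;
+ *     dav1d_mc_dsp_init_8bpc(&c);  // C defaults, then ASM overrides
+ *
+ * and afterwards dispatches through the function pointers, e.g. c.avg(). */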
diff --git a/src/meson.build b/src/meson.build
new file mode 100644 (file)
index 0000000..fd8ad02
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,336 @@
+# Copyright © 2018-2019, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d library
+#
+
+# libdav1d source files
+libdav1d_sources = files(
+    'cdf.c',
+    'cpu.c',
+    'data.c',
+    'decode.c',
+    'dequant_tables.c',
+    'getbits.c',
+    'intra_edge.c',
+    'itx_1d.c',
+    'lf_mask.c',
+    'log.c',
+    'msac.c',
+    'obu.c',
+    'picture.c',
+    'qm.c',
+    'ref.c',
+    'refmvs.c',
+    'scan.c',
+    'tables.c',
+    'warpmv.c',
+    'wedge.c',
+)
+
+# libdav1d bitdepth source files
+# These files are compiled for each bitdepth with
+# `BITDEPTH` defined to the currently built bitdepth.
+libdav1d_tmpl_sources = files(
+    'cdef_apply_tmpl.c',
+    'cdef_tmpl.c',
+    'fg_apply_tmpl.c',
+    'film_grain_tmpl.c',
+    'ipred_prepare_tmpl.c',
+    'ipred_tmpl.c',
+    'itx_tmpl.c',
+    'lf_apply_tmpl.c',
+    'loopfilter_tmpl.c',
+    'looprestoration_tmpl.c',
+    'lr_apply_tmpl.c',
+    'mc_tmpl.c',
+    'recon_tmpl.c',
+)
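+
+# For example (editor's note): mc_tmpl.c is compiled once per entry in
+# dav1d_bitdepths, with `-DBITDEPTH=8` for the 8 bpc objects and
+# `-DBITDEPTH=16` for the 16 bpc objects, via the per-bitdepth
+# static_library() loops further down.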
+
+libdav1d_arch_tmpl_sources = []
+
+libdav1d_bitdepth_objs = []
+
+# libdav1d entrypoint source files
+# These source files contain library entry points and are
+# built with the stack-realign flag set, where necessary.
+libdav1d_entrypoints_sources = files(
+    'lib.c',
+    'thread_task.c'
+)
+
+# ASM-specific sources
+libdav1d_nasm_objs = []
+# Arch-specific flags
+arch_flags = []
+if is_asm_enabled
+    if (host_machine.cpu_family() == 'aarch64' or
+        host_machine.cpu_family().startswith('arm'))
+
+        libdav1d_sources += files(
+            'arm/cpu.c',
+        )
+        libdav1d_tmpl_sources += files(
+            'arm/cdef_init_tmpl.c',
+            'arm/ipred_init_tmpl.c',
+            'arm/itx_init_tmpl.c',
+            'arm/loopfilter_init_tmpl.c',
+            'arm/looprestoration_init_tmpl.c',
+            'arm/mc_init_tmpl.c',
+        )
+        if host_machine.cpu_family() == 'aarch64'
+            libdav1d_sources += files(
+                # itx.S is used for both 8 and 16 bpc.
+                'arm/64/itx.S',
+                'arm/64/looprestoration_common.S',
+                'arm/64/msac.S',
+            )
+
+            if dav1d_bitdepths.contains('8')
+                libdav1d_sources += files(
+                    'arm/64/cdef.S',
+                    'arm/64/ipred.S',
+                    'arm/64/loopfilter.S',
+                    'arm/64/looprestoration.S',
+                    'arm/64/mc.S',
+                )
+            endif
+
+            if dav1d_bitdepths.contains('16')
+                libdav1d_sources += files(
+                    'arm/64/cdef16.S',
+                    'arm/64/ipred16.S',
+                    'arm/64/itx16.S',
+                    'arm/64/loopfilter16.S',
+                    'arm/64/looprestoration16.S',
+                    'arm/64/mc16.S',
+                )
+            endif
+        elif host_machine.cpu_family().startswith('arm')
+            libdav1d_sources += files(
+                'arm/32/msac.S',
+            )
+
+            if dav1d_bitdepths.contains('8')
+                libdav1d_sources += files(
+                    'arm/32/cdef.S',
+                    'arm/32/ipred.S',
+                    'arm/32/itx.S',
+                    'arm/32/loopfilter.S',
+                    'arm/32/looprestoration.S',
+                    'arm/32/mc.S',
+                )
+            endif
+
+            if dav1d_bitdepths.contains('16')
+                libdav1d_sources += files(
+                )
+            endif
+        endif
+    elif host_machine.cpu_family().startswith('x86')
+
+        libdav1d_sources += files(
+            'x86/cpu.c',
+            'x86/msac_init.c',
+        )
+
+        libdav1d_tmpl_sources += files(
+            'x86/cdef_init_tmpl.c',
+            'x86/film_grain_init_tmpl.c',
+            'x86/ipred_init_tmpl.c',
+            'x86/itx_init_tmpl.c',
+            'x86/loopfilter_init_tmpl.c',
+            'x86/looprestoration_init_tmpl.c',
+            'x86/mc_init_tmpl.c',
+        )
+
+        # NASM source files
+        libdav1d_sources_asm = files(
+            'x86/cpuid.asm',
+            'x86/msac.asm',
+        )
+
+        if dav1d_bitdepths.contains('8')
+            libdav1d_sources_asm += files(
+                'x86/cdef_avx512.asm',
+                'x86/cdef_avx2.asm',
+                'x86/film_grain.asm',
+                'x86/ipred.asm',
+                'x86/itx.asm',
+                'x86/loopfilter.asm',
+                'x86/looprestoration.asm',
+                'x86/mc.asm',
+                'x86/cdef_sse.asm',
+                'x86/film_grain_ssse3.asm',
+                'x86/ipred_ssse3.asm',
+                'x86/itx_ssse3.asm',
+                'x86/loopfilter_ssse3.asm',
+                'x86/looprestoration_ssse3.asm',
+                'x86/mc_sse.asm',
+            )
+        endif
+
+        if dav1d_bitdepths.contains('16')
+            libdav1d_sources_asm += files(
+            )
+        endif
+
+        # Compile the ASM sources with NASM
+        libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
+    elif host_machine.cpu() == 'ppc64le'
+        arch_flags = ['-maltivec', '-mvsx']
+        libdav1d_sources += files(
+            'ppc/cpu.c',
+        )
+        libdav1d_arch_tmpl_sources += files(
+            'ppc/cdef_init_tmpl.c',
+            'ppc/looprestoration_init_tmpl.c',
+        )
+    endif
+endif
+
+
+
+api_export_flags = []
+
+#
+# Windows .rc file and API export flags
+#
+
+if host_machine.system() == 'windows' and get_option('default_library') != 'static'
+    rc_version_array = meson.project_version().split('.')
+    winmod = import('windows')
+    rc_data = configuration_data()
+    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
+    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
+    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
+    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
+    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
+    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
+    rc_data.set('COPYRIGHT_YEARS', '2019')
+
+    rc_file = configure_file(
+        input : 'dav1d.rc.in',
+        output : 'dav1d.rc',
+        configuration : rc_data
+    )
+
+    libdav1d_rc_obj = winmod.compile_resources(rc_file)
+
+    api_export_flags = ['-DDAV1D_BUILDING_DLL']
+else
+    libdav1d_rc_obj = []
+endif
+
+
+
+
+#
+# Library definitions
+#
+
+# Helper library for dav1d entrypoints
+libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
+    libdav1d_entrypoints_sources,
+    rev_target, config_h_target,
+
+    include_directories : dav1d_inc_dirs,
+    dependencies: [stdatomic_dependency],
+    c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
+    install : false,
+    build_by_default : false,
+).extract_all_objects()
+
+# Helper library for each bitdepth
+libdav1d_bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+    libdav1d_bitdepth_objs += static_library(
+        'dav1d_bitdepth_@0@'.format(bitdepth),
+        libdav1d_tmpl_sources, config_h_target,
+        include_directories: dav1d_inc_dirs,
+        dependencies : [stdatomic_dependency],
+        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
+        install : false,
+        build_by_default : false,
+    ).extract_all_objects()
+endforeach
+
+# Helper library for each bitdepth and architecture-specific flags
+foreach bitdepth : dav1d_bitdepths
+    libdav1d_bitdepth_objs += static_library(
+        'dav1d_arch_bitdepth_@0@'.format(bitdepth),
+        libdav1d_arch_tmpl_sources, config_h_target,
+        include_directories: dav1d_inc_dirs,
+        dependencies : [stdatomic_dependency],
+        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
+        install : false,
+        build_by_default : false,
+    ).extract_all_objects()
+endforeach
+
+# The final dav1d library
+if host_machine.system() == 'windows'
+    dav1d_soversion = ''
+else
+    dav1d_soversion = dav1d_api_version_major
+endif
+
+libdav1d = library('dav1d',
+    libdav1d_sources,
+    libdav1d_nasm_objs,
+    libdav1d_rc_obj,
+
+    objects : [
+        libdav1d_bitdepth_objs,
+        libdav1d_entrypoints_objs
+        ],
+
+    include_directories : dav1d_inc_dirs,
+    dependencies : [
+        stdatomic_dependency,
+        thread_dependency,
+        thread_compat_dep,
+        libdl_dependency,
+        ],
+    c_args : [stackalign_flag, api_export_flags],
+    version : dav1d_soname_version,
+    soversion : dav1d_soversion,
+    install : true,
+)
+
+dav1d_dep = declare_dependency(link_with: libdav1d,
+    include_directories : include_directories('../include/dav1d')
+)
+
+#
+# Generate pkg-config .pc file
+#
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries: libdav1d,
+    version: meson.project_version(),
+    name: 'libdav1d',
+    filebase: 'dav1d',
+    description: 'AV1 decoding library'
+)
diff --git a/src/msac.c b/src/msac.c
new file mode 100644 (file)
index 0000000..8195977
--- /dev/null
+++ b/src/msac.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+
+#include "common/intops.h"
+
+#include "src/msac.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
+
+#define EC_WIN_SIZE (sizeof(ec_win) << 3)
+
+static inline void ctx_refill(MsacContext *const s) {
+    const uint8_t *buf_pos = s->buf_pos;
+    const uint8_t *buf_end = s->buf_end;
+    int c = EC_WIN_SIZE - s->cnt - 24;
+    ec_win dif = s->dif;
+    while (c >= 0 && buf_pos < buf_end) {
+        dif ^= ((ec_win)*buf_pos++) << c;
+        c -= 8;
+    }
+    s->dif = dif;
+    s->cnt = EC_WIN_SIZE - c - 24;
+    s->buf_pos = buf_pos;
+}
+
+/* Takes updated dif and range values, renormalizes them so that
+ * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ * necessary), and stores them back in the decoder context.
+ * dif: The new value of dif.
+ * rng: The new value of the range. */
+static inline void ctx_norm(MsacContext *const s, const ec_win dif,
+                            const unsigned rng)
+{
+    const int d = 15 ^ (31 ^ clz(rng));
+    assert(rng <= 65535U);
+    s->cnt -= d;
+    s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+    s->rng = rng << d;
+    if (s->cnt < 0)
+        ctx_refill(s);
+}
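+
+/* Worked example (editor's note): for rng = 0x2800, 31 ^ clz(rng) = 13 is
+ * the index of the highest set bit, so d = 15 ^ 13 = 2 and the renormalized
+ * range 0x2800 << 2 = 0xa000 is back in [0x8000, 0x10000). dif shifts by
+ * the same d with 1s filling the low bits, and cnt drops by d until
+ * ctx_refill() is needed. */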
+
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) {
+    const unsigned r = s->rng;
+    ec_win dif = s->dif;
+    assert((dif >> (EC_WIN_SIZE - 16)) < r);
+    // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
+    // replace the multiply with a simple shift.
+    unsigned v = ((r >> 8) << 7) + EC_MIN_PROB;
+    const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+    const unsigned ret = dif >= vw;
+    dif -= ret * vw;
+    v += ret * (r - 2 * v);
+    ctx_norm(s, dif, v);
+    return !ret;
+}
+
+/* Decode a single binary value.
+ * f: The probability that the bit is one
+ * Return: The value decoded (0 or 1). */
+unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
+    const unsigned r = s->rng;
+    ec_win dif = s->dif;
+    assert((dif >> (EC_WIN_SIZE - 16)) < r);
+    unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
+    const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+    const unsigned ret = dif >= vw;
+    dif -= ret * vw;
+    v += ret * (r - 2 * v);
+    ctx_norm(s, dif, v);
+    return !ret;
+}
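+
+/* Cross-check (editor's note): with f = 16384 (Q15 for 1/2), f >> 6 = 256
+ * and v = ((r >> 8) * 256 >> 1) + 4 = ((r >> 8) << 7) + 4, exactly the
+ * shift-only computation in dav1d_msac_decode_bool_equi_c() above. */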
+
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+                             const int n, const unsigned k)
+{
+    int i = 0;
+    int a = 0;
+    int b = k;
+    while ((2 << b) < n) {
+        if (!dav1d_msac_decode_bool_equi(s)) break;
+        b = k + i++;
+        a = (1 << b);
+    }
+    const unsigned v = dav1d_msac_decode_bools(s, b) + a;
+    return ref * 2 <= n ? inv_recenter(ref, v) :
+                          n - 1 - inv_recenter(n - 1 - ref, v);
+}
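+
+/* Editor's note: this is AV1's subexponential code with parameter k.
+ * Values fall into buckets of width 1 << k, 1 << k, 1 << (k + 1), ...;
+ * each equiprobable bit above either stops at the current bucket or moves
+ * to the next, dav1d_msac_decode_bools() reads the offset within it, and
+ * inv_recenter() folds the result around ref. */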
+
+/* Decodes a symbol given an inverse cumulative distribution function (CDF)
+ * table in Q15. */
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
+                                          uint16_t *const cdf,
+                                          const size_t n_symbols)
+{
+    const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
+    unsigned u, v = s->rng, val = -1;
+
+    assert(n_symbols <= 15);
+    assert(cdf[n_symbols] <= 32);
+
+    do {
+        val++;
+        u = v;
+        v = r * (cdf[val] >> EC_PROB_SHIFT);
+        v >>= 7 - EC_PROB_SHIFT;
+        v += EC_MIN_PROB * ((unsigned)n_symbols - val);
+    } while (c < v);
+
+    assert(u <= s->rng);
+
+    ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
+
+    if (s->allow_update_cdf) {
+        const unsigned count = cdf[n_symbols];
+        const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
+        unsigned i;
+        for (i = 0; i < val; i++)
+            cdf[i] += (32768 - cdf[i]) >> rate;
+        for (; i < n_symbols; i++)
+            cdf[i] -= cdf[i] >> rate;
+        cdf[n_symbols] = count + (count < 32);
+    }
+
+    return val;
+}
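+
+/* Editor's note on the adaptation above: the update rate is
+ * 4 + (count >> 4) + (n_symbols > 2), and the per-CDF counter in
+ * cdf[n_symbols] saturates at 32, so steps shrink as more symbols are
+ * seen. Entries before the decoded symbol move toward 32768; the rest
+ * decay toward 0. */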
+
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
+                                        uint16_t *const cdf)
+{
+    const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
+
+    if (s->allow_update_cdf) {
+        // update_cdf() specialized for boolean CDFs
+        const unsigned count = cdf[1];
+        const int rate = 4 + (count >> 4);
+        if (bit)
+            cdf[0] += (32768 - cdf[0]) >> rate;
+        else
+            cdf[0] -= cdf[0] >> rate;
+        cdf[1] = count + (count < 32);
+    }
+
+    return bit;
+}
+
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
+    unsigned tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+    unsigned tok = 3 + tok_br;
+    if (tok_br == 3) {
+        tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+        tok = 6 + tok_br;
+        if (tok_br == 3) {
+            tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+            tok = 9 + tok_br;
+            if (tok_br == 3)
+                tok = 12 + dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+        }
+    }
+    return tok;
+}
+
+void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
+                     const size_t sz, const int disable_cdf_update_flag)
+{
+    s->buf_pos = data;
+    s->buf_end = data + sz;
+    s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+    s->rng = 0x8000;
+    s->cnt = -15;
+    s->allow_update_cdf = !disable_cdf_update_flag;
+    ctx_refill(s);
+
+#if ARCH_X86_64 && HAVE_ASM
+    s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
+
+    dav1d_msac_init_x86(s);
+#endif
+}
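+
+/* Usage sketch (editor's note, not upstream code; `payload` is a
+ * hypothetical buffer, the real callers being the tile setup in decode.c):
+ *
+ *     MsacContext s;
+ *     dav1d_msac_init(&s, payload, payload_size, 0);
+ *     unsigned bit = dav1d_msac_decode_bool_equi(&s);
+ */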
diff --git a/src/msac.h b/src/msac.h
new file mode 100644 (file)
index 0000000..eb04f58
--- /dev/null
+++ b/src/msac.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MSAC_H
+#define DAV1D_SRC_MSAC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+typedef size_t ec_win;
+
+typedef struct MsacContext {
+    const uint8_t *buf_pos;
+    const uint8_t *buf_end;
+    ec_win dif;
+    unsigned rng;
+    int cnt;
+    int allow_update_cdf;
+
+#if ARCH_X86_64 && HAVE_ASM
+    unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
+#endif
+} MsacContext;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/msac.h"
+#elif ARCH_X86
+#include "src/x86/msac.h"
+#endif
+#endif
+
+void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
+                     int disable_cdf_update_flag);
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
+                                          size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
+unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
+int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
+
+/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
+#ifndef dav1d_msac_decode_symbol_adapt4
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt8
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt16
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_adapt
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_equi
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_c
+#endif
+#ifndef dav1d_msac_decode_bool
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_c
+#endif
+#ifndef dav1d_msac_decode_hi_tok
+#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_c
+#endif
+
+static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
+    unsigned v = 0;
+    while (n--)
+        v = (v << 1) | dav1d_msac_decode_bool_equi(s);
+    return v;
+}
+
+static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
+    assert(n > 0);
+    const int l = ulog2(n) + 1;
+    assert(l > 1);
+    const unsigned m = (1 << l) - n;
+    const unsigned v = dav1d_msac_decode_bools(s, l - 1);
+    return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
+}
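+
+/* Worked example (editor's note): for n = 5, l = 3 and m = 3. Two bits give
+ * v in 0..3; v = 0..2 is returned as-is (2-bit codes), while v = 3 reads
+ * one extra equiprobable bit and returns (3 << 1) - 3 + bit, i.e. 3 or 4. */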
+
+#endif /* DAV1D_SRC_MSAC_H */
diff --git a/src/obu.c b/src/obu.c
new file mode 100644 (file)
index 0000000..ab9688c
--- /dev/null
+++ b/src/obu.c
@@ -0,0 +1,1559 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "dav1d/data.h"
+
+#include "common/intops.h"
+
+#include "src/decode.h"
+#include "src/getbits.h"
+#include "src/levels.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+
+static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
+                         Dav1dSequenceHeader *const hdr)
+{
+#define DEBUG_SEQ_HDR 0
+
+#if DEBUG_SEQ_HDR
+    const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
+#endif
+
+    hdr->profile = dav1d_get_bits(gb, 3);
+    if (hdr->profile > 2) goto error;
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-profile: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    hdr->still_picture = dav1d_get_bits(gb, 1);
+    hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1);
+    if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-stillpicture_flags: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    if (hdr->reduced_still_picture_header) {
+        hdr->timing_info_present = 0;
+        hdr->decoder_model_info_present = 0;
+        hdr->display_model_info_present = 0;
+        hdr->num_operating_points = 1;
+        hdr->operating_points[0].idc = 0;
+        hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3);
+        hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2);
+        hdr->operating_points[0].tier = 0;
+        hdr->operating_points[0].decoder_model_param_present = 0;
+        hdr->operating_points[0].display_model_param_present = 0;
+    } else {
+        hdr->timing_info_present = dav1d_get_bits(gb, 1);
+        if (hdr->timing_info_present) {
+            hdr->num_units_in_tick = dav1d_get_bits(gb, 32);
+            hdr->time_scale = dav1d_get_bits(gb, 32);
+            hdr->equal_picture_interval = dav1d_get_bits(gb, 1);
+            if (hdr->equal_picture_interval) {
+                const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
+                if (num_ticks_per_picture == 0xFFFFFFFFU)
+                    goto error;
+                hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
+            }
+
+            hdr->decoder_model_info_present = dav1d_get_bits(gb, 1);
+            if (hdr->decoder_model_info_present) {
+                hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1;
+                hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32);
+                hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
+                hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
+            }
+        } else {
+            hdr->decoder_model_info_present = 0;
+        }
+#if DEBUG_SEQ_HDR
+        printf("SEQHDR: post-timinginfo: off=%ld\n",
+               dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+        hdr->display_model_info_present = dav1d_get_bits(gb, 1);
+        hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
+        for (int i = 0; i < hdr->num_operating_points; i++) {
+            struct Dav1dSequenceHeaderOperatingPoint *const op =
+                &hdr->operating_points[i];
+            op->idc = dav1d_get_bits(gb, 12);
+            op->major_level = 2 + dav1d_get_bits(gb, 3);
+            op->minor_level = dav1d_get_bits(gb, 2);
+            op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
+            op->decoder_model_param_present =
+                hdr->decoder_model_info_present && dav1d_get_bits(gb, 1);
+            if (op->decoder_model_param_present) {
+                struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
+                    &hdr->operating_parameter_info[i];
+                opi->decoder_buffer_delay =
+                    dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+                opi->encoder_buffer_delay =
+                    dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+                opi->low_delay_mode = dav1d_get_bits(gb, 1);
+            }
+            op->display_model_param_present =
+                hdr->display_model_info_present && dav1d_get_bits(gb, 1);
+            if (op->display_model_param_present) {
+                op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
+            }
+        }
+        const int op_idx =
+            c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
+        c->operating_point_idc = hdr->operating_points[op_idx].idc;
+#if DEBUG_SEQ_HDR
+        printf("SEQHDR: post-operating-points: off=%ld\n",
+               dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+    }
+
+    hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
+    hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
+    hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
+    hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-size: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+    hdr->frame_id_numbers_present =
+        hdr->reduced_still_picture_header ? 0 : dav1d_get_bits(gb, 1);
+    if (hdr->frame_id_numbers_present) {
+        hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
+        hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
+    }
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-frame-id-numbers-present: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    hdr->sb128 = dav1d_get_bits(gb, 1);
+    hdr->filter_intra = dav1d_get_bits(gb, 1);
+    hdr->intra_edge_filter = dav1d_get_bits(gb, 1);
+    if (hdr->reduced_still_picture_header) {
+        hdr->inter_intra = 0;
+        hdr->masked_compound = 0;
+        hdr->warped_motion = 0;
+        hdr->dual_filter = 0;
+        hdr->order_hint = 0;
+        hdr->jnt_comp = 0;
+        hdr->ref_frame_mvs = 0;
+        hdr->order_hint_n_bits = 0;
+        hdr->screen_content_tools = DAV1D_ADAPTIVE;
+        hdr->force_integer_mv = DAV1D_ADAPTIVE;
+    } else {
+        hdr->inter_intra = dav1d_get_bits(gb, 1);
+        hdr->masked_compound = dav1d_get_bits(gb, 1);
+        hdr->warped_motion = dav1d_get_bits(gb, 1);
+        hdr->dual_filter = dav1d_get_bits(gb, 1);
+        hdr->order_hint = dav1d_get_bits(gb, 1);
+        if (hdr->order_hint) {
+            hdr->jnt_comp = dav1d_get_bits(gb, 1);
+            hdr->ref_frame_mvs = dav1d_get_bits(gb, 1);
+        } else {
+            hdr->jnt_comp = 0;
+            hdr->ref_frame_mvs = 0;
+            hdr->order_hint_n_bits = 0;
+        }
+        hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1);
+    #if DEBUG_SEQ_HDR
+        printf("SEQHDR: post-screentools: off=%ld\n",
+               dav1d_get_bits_pos(gb) - init_bit_pos);
+    #endif
+        hdr->force_integer_mv = hdr->screen_content_tools ?
+                                dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2;
+        if (hdr->order_hint)
+            hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
+    }
+    hdr->super_res = dav1d_get_bits(gb, 1);
+    hdr->cdef = dav1d_get_bits(gb, 1);
+    hdr->restoration = dav1d_get_bits(gb, 1);
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-featurebits: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    hdr->hbd = dav1d_get_bits(gb, 1);
+    if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1);
+    hdr->monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0;
+    hdr->color_description_present = dav1d_get_bits(gb, 1);
+    if (hdr->color_description_present) {
+        hdr->pri = dav1d_get_bits(gb, 8);
+        hdr->trc = dav1d_get_bits(gb, 8);
+        hdr->mtrx = dav1d_get_bits(gb, 8);
+    } else {
+        hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
+        hdr->trc = DAV1D_TRC_UNKNOWN;
+        hdr->mtrx = DAV1D_MC_UNKNOWN;
+    }
+    if (hdr->monochrome) {
+        hdr->color_range = dav1d_get_bits(gb, 1);
+        hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
+        hdr->ss_hor = hdr->ss_ver = 1;
+        hdr->chr = DAV1D_CHR_UNKNOWN;
+        hdr->separate_uv_delta_q = 0;
+    } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
+               hdr->trc == DAV1D_TRC_SRGB &&
+               hdr->mtrx == DAV1D_MC_IDENTITY)
+    {
+        hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+        hdr->ss_hor = hdr->ss_ver = 0;
+        hdr->color_range = 1;
+        if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
+            goto error;
+    } else {
+        hdr->color_range = dav1d_get_bits(gb, 1);
+        switch (hdr->profile) {
+        case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
+                hdr->ss_hor = hdr->ss_ver = 1;
+                break;
+        case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+                hdr->ss_hor = hdr->ss_ver = 0;
+                break;
+        case 2:
+            if (hdr->hbd == 2) {
+                hdr->ss_hor = dav1d_get_bits(gb, 1);
+                hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1);
+            } else {
+                hdr->ss_hor = 1;
+                hdr->ss_ver = 0;
+            }
+            hdr->layout = hdr->ss_hor ?
+                          hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
+                                        DAV1D_PIXEL_LAYOUT_I422 :
+                                        DAV1D_PIXEL_LAYOUT_I444;
+            break;
+        }
+        hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
+                   dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
+    }
+    hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-colorinfo: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    hdr->film_grain_present = dav1d_get_bits(gb, 1);
+#if DEBUG_SEQ_HDR
+    printf("SEQHDR: post-filmgrain: off=%ld\n",
+           dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+    dav1d_get_bits(gb, 1); // dummy bit
+
+    // We needn't bother flushing the OBU here: we'll check we didn't
+    // overrun in the caller and will then discard gb, so there's no
+    // point in setting its position properly.
+
+    return 0;
+
+error:
+    dav1d_log(c, "Error parsing sequence header\n");
+    return DAV1D_ERR(EINVAL);
+}
+
+static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
+                           const int use_ref)
+{
+    const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+    Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+    if (use_ref) {
+        for (int i = 0; i < 7; i++) {
+            if (dav1d_get_bits(gb, 1)) {
+                const Dav1dThreadPicture *const ref =
+                    &c->refs[c->frame_hdr->refidx[i]].p;
+                if (!ref->p.data[0]) return -1;
+                hdr->width[1] = ref->p.p.w;
+                hdr->height = ref->p.p.h;
+                hdr->render_width = ref->p.frame_hdr->render_width;
+                hdr->render_height = ref->p.frame_hdr->render_height;
+                hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
+                if (hdr->super_res.enabled) {
+                    const int d = hdr->super_res.width_scale_denominator =
+                        9 + dav1d_get_bits(gb, 3);
+                    hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d,
+                                         imin(16, hdr->width[1]));
+                } else {
+                    hdr->super_res.width_scale_denominator = 8;
+                    hdr->width[0] = hdr->width[1];
+                }
+                return 0;
+            }
+        }
+    }
+
+    if (hdr->frame_size_override) {
+        hdr->width[1] = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
+        hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
+    } else {
+        hdr->width[1] = seqhdr->max_width;
+        hdr->height = seqhdr->max_height;
+    }
+    hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
+    if (hdr->super_res.enabled) {
+        const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
+        hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
+    } else {
+        hdr->super_res.width_scale_denominator = 8;
+        hdr->width[0] = hdr->width[1];
+    }
+    hdr->have_render_size = dav1d_get_bits(gb, 1);
+    if (hdr->have_render_size) {
+        hdr->render_width = dav1d_get_bits(gb, 16) + 1;
+        hdr->render_height = dav1d_get_bits(gb, 16) + 1;
+    } else {
+        hdr->render_width = hdr->width[1];
+        hdr->render_height = hdr->height;
+    }
+    return 0;
+}
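+
+/* Worked example (editor's note): the super-res denominator d is 9..16, so
+ * for width[1] = 1920 and d = 16 the coded width becomes
+ * width[0] = imax((1920 * 8 + 8) / 16, 16) = 960, i.e. an 8/d scale with
+ * rounding, floored at 16 pixels. */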
+
+static inline int tile_log2(const int sz, const int tgt) {
+    int k;
+    for (k = 0; (sz << k) < tgt; k++) ;
+    return k;
+}
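+
+/* Editor's note: tile_log2(sz, tgt) returns the smallest k such that
+ * sz << k >= tgt, e.g. tile_log2(1, 5) == 3. */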
+
+static const Dav1dLoopfilterModeRefDeltas default_mode_ref_deltas = {
+    .mode_delta = { 0, 0 },
+    .ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 },
+};
+
+static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
+#define DEBUG_FRAME_HDR 0
+
+#if DEBUG_FRAME_HDR
+    const uint8_t *const init_ptr = gb->ptr;
+#endif
+    const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+    Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+    hdr->show_existing_frame =
+        !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-show_existing_frame: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    if (hdr->show_existing_frame) {
+        hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
+        if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+            hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+        if (seqhdr->frame_id_numbers_present)
+            hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+        return 0;
+    }
+
+    hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2);
+    hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
+    if (hdr->show_frame) {
+        if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+            hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+    } else
+        hdr->showable_frame = dav1d_get_bits(gb, 1);
+    hdr->error_resilient_mode =
+        (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ||
+        hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
+        seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-frametype_bits: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->disable_cdf_update = dav1d_get_bits(gb, 1);
+    hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
+                                 dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools;
+    if (hdr->allow_screen_content_tools)
+        hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
+                                dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv;
+    else
+        hdr->force_integer_mv = 0;
+
+    if (!(hdr->frame_type & 1))
+        hdr->force_integer_mv = 1;
+
+    if (seqhdr->frame_id_numbers_present)
+        hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+
+    hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
+                               hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-frame_size_override_flag: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->frame_offset = seqhdr->order_hint ?
+                        dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
+    hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ?
+                             dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
+
+    if (seqhdr->decoder_model_info_present) {
+        hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1);
+        if (hdr->buffer_removal_time_present) {
+            for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
+                const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
+                struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
+                if (seqop->decoder_model_param_present) {
+                    int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1;
+                    int in_spatial_layer  = (seqop->idc >> (hdr->spatial_id + 8)) & 1;
+                    if (!seqop->idc || (in_temporal_layer && in_spatial_layer))
+                        op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length);
+                }
+            }
+        }
+    }
+
+    if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY ||
+        hdr->frame_type == DAV1D_FRAME_TYPE_INTRA)
+    {
+        hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
+                                    hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
+        if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
+            for (int i = 0; i < 8; i++)
+                dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+        if (read_frame_size(c, gb, 0) < 0) goto error;
+        hdr->allow_intrabc = hdr->allow_screen_content_tools &&
+                             !hdr->super_res.enabled && dav1d_get_bits(gb, 1);
+        hdr->use_ref_frame_mvs = 0;
+    } else {
+        hdr->allow_intrabc = 0;
+        hdr->refresh_frame_flags = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 0xff :
+                                   dav1d_get_bits(gb, 8);
+        if (hdr->error_resilient_mode && seqhdr->order_hint)
+            for (int i = 0; i < 8; i++)
+                dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+        hdr->frame_ref_short_signaling =
+            seqhdr->order_hint && dav1d_get_bits(gb, 1);
+        if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8
+            hdr->refidx[0] = dav1d_get_bits(gb, 3);
+            hdr->refidx[1] = hdr->refidx[2] = -1;
+            hdr->refidx[3] = dav1d_get_bits(gb, 3);
+            hdr->refidx[4] = hdr->refidx[5] = hdr->refidx[6] = -1;
+
+            int shifted_frame_offset[8];
+            const int current_frame_offset = 1 << (seqhdr->order_hint_n_bits - 1);
+            for (int i = 0; i < 8; i++) {
+                if (!c->refs[i].p.p.frame_hdr) goto error;
+                shifted_frame_offset[i] = current_frame_offset +
+                    get_poc_diff(seqhdr->order_hint_n_bits,
+                                 c->refs[i].p.p.frame_hdr->frame_offset,
+                                 hdr->frame_offset);
+            }
+
+            int used_frame[8] = { 0 };
+            used_frame[hdr->refidx[0]] = 1;
+            used_frame[hdr->refidx[3]] = 1;
+
+            int latest_frame_offset = -1;
+            for (int i = 0; i < 8; i++) {
+                const int hint = shifted_frame_offset[i];
+                if (!used_frame[i] && hint >= current_frame_offset &&
+                    hint >= latest_frame_offset)
+                {
+                    hdr->refidx[6] = i;
+                    latest_frame_offset = hint;
+                }
+            }
+            if (latest_frame_offset != -1)
+                used_frame[hdr->refidx[6]] = 1;
+
+            int earliest_frame_offset = INT_MAX;
+            for (int i = 0; i < 8; i++) {
+                const int hint = shifted_frame_offset[i];
+                if (!used_frame[i] && hint >= current_frame_offset &&
+                    hint < earliest_frame_offset)
+                {
+                    hdr->refidx[4] = i;
+                    earliest_frame_offset = hint;
+                }
+            }
+            if (earliest_frame_offset != INT_MAX)
+                used_frame[hdr->refidx[4]] = 1;
+
+            earliest_frame_offset = INT_MAX;
+            for (int i = 0; i < 8; i++) {
+                const int hint = shifted_frame_offset[i];
+                if (!used_frame[i] && hint >= current_frame_offset &&
+                    (hint < earliest_frame_offset))
+                {
+                    hdr->refidx[5] = i;
+                    earliest_frame_offset = hint;
+                }
+            }
+            if (earliest_frame_offset != INT_MAX)
+                used_frame[hdr->refidx[5]] = 1;
+
+            for (int i = 1; i < 7; i++) {
+                if (hdr->refidx[i] < 0) {
+                    latest_frame_offset = -1;
+                    for (int j = 0; j < 8; j++) {
+                        const int hint = shifted_frame_offset[j];
+                        if (!used_frame[j] && hint < current_frame_offset &&
+                            hint >= latest_frame_offset)
+                        {
+                            hdr->refidx[i] = j;
+                            latest_frame_offset = hint;
+                        }
+                    }
+                    if (latest_frame_offset != -1)
+                        used_frame[hdr->refidx[i]] = 1;
+                }
+            }
+
+            earliest_frame_offset = INT_MAX;
+            int ref = -1;
+            for (int i = 0; i < 8; i++) {
+                const int hint = shifted_frame_offset[i];
+                if (hint < earliest_frame_offset) {
+                    ref = i;
+                    earliest_frame_offset = hint;
+                }
+            }
+            for (int i = 0; i < 7; i++) {
+                if (hdr->refidx[i] < 0)
+                    hdr->refidx[i] = ref;
+            }
+        }
+        for (int i = 0; i < 7; i++) {
+            if (!hdr->frame_ref_short_signaling)
+                hdr->refidx[i] = dav1d_get_bits(gb, 3);
+            if (seqhdr->frame_id_numbers_present)
+                dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+        }
+        const int use_ref = !hdr->error_resilient_mode &&
+                            hdr->frame_size_override;
+        if (read_frame_size(c, gb, use_ref) < 0) goto error;
+        hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1);
+        hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE :
+                                                          dav1d_get_bits(gb, 2);
+        hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
+        hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
+            seqhdr->ref_frame_mvs && seqhdr->order_hint &&
+            hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-frametype-specific-bits: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
+                           !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-refresh_context: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // tile data
+    hdr->tiling.uniform = dav1d_get_bits(gb, 1);
+    const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
+    const int sbsz_log2 = 6 + seqhdr->sb128;
+    const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
+    const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
+    const int max_tile_width_sb = 4096 >> sbsz_log2;
+    const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
+    hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
+    hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
+    hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
+    const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
+                                    hdr->tiling.min_log2_cols);
+    if (hdr->tiling.uniform) {
+        for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
+             hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1);
+             hdr->tiling.log2_cols++) ;
+        const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
+        hdr->tiling.cols = 0;
+        for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++)
+            hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+        hdr->tiling.min_log2_rows =
+            imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
+
+        for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
+             hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1);
+             hdr->tiling.log2_rows++) ;
+        const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
+        hdr->tiling.rows = 0;
+        for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++)
+            hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+    } else {
+        hdr->tiling.cols = 0;
+        int widest_tile = 0, max_tile_area_sb = sbw * sbh;
+        for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) {
+            const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
+            const int tile_w = (tile_width_sb > 1) ?
+                                   1 + dav1d_get_uniform(gb, tile_width_sb) :
+                                   1;
+            hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+            sbx += tile_w;
+            widest_tile = imax(widest_tile, tile_w);
+        }
+        hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
+        if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
+        const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
+
+        hdr->tiling.rows = 0;
+        for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
+            const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
+            const int tile_h = (tile_height_sb > 1) ?
+                                   1 + dav1d_get_uniform(gb, tile_height_sb) :
+                                   1;
+            hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+            sby += tile_h;
+        }
+        hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows);
+    }
+    hdr->tiling.col_start_sb[hdr->tiling.cols] = sbw;
+    hdr->tiling.row_start_sb[hdr->tiling.rows] = sbh;
+    if (hdr->tiling.log2_cols || hdr->tiling.log2_rows) {
+        hdr->tiling.update = dav1d_get_bits(gb, hdr->tiling.log2_cols +
+                                                hdr->tiling.log2_rows);
+        if (hdr->tiling.update >= hdr->tiling.cols * hdr->tiling.rows)
+            goto error;
+        hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
+    } else {
+        hdr->tiling.n_bytes = hdr->tiling.update = 0;
+    }
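+
+    /* Worked example (editor's note): a 1920x1088 frame without 128x128
+     * superblocks has sbsz_log2 = 6, sbw = 30, sbh = 17 and
+     * max_tile_width_sb = 64, so min_log2_cols = 0 and a single uniform
+     * tile covering the whole frame is a legal outcome of the loops
+     * above. */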
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-tiling: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // quant data
+    hdr->quant.yac = dav1d_get_bits(gb, 8);
+    hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+    if (!seqhdr->monochrome) {
+        // If the sequence header says that delta_q might be different
+        // for U, V, we must check whether it actually is for this
+        // frame.
+        const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0;
+        hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+        hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+        if (diff_uv_delta) {
+            hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+            hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+        } else {
+            hdr->quant.vdc_delta = hdr->quant.udc_delta;
+            hdr->quant.vac_delta = hdr->quant.uac_delta;
+        }
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-quant: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->quant.qm = dav1d_get_bits(gb, 1);
+    if (hdr->quant.qm) {
+        hdr->quant.qm_y = dav1d_get_bits(gb, 4);
+        hdr->quant.qm_u = dav1d_get_bits(gb, 4);
+        hdr->quant.qm_v =
+            seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) :
+                                          hdr->quant.qm_u;
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-qm: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // segmentation data
+    hdr->segmentation.enabled = dav1d_get_bits(gb, 1);
+    if (hdr->segmentation.enabled) {
+        if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+            hdr->segmentation.update_map = 1;
+            hdr->segmentation.temporal = 0;
+            hdr->segmentation.update_data = 1;
+        } else {
+            hdr->segmentation.update_map = dav1d_get_bits(gb, 1);
+            hdr->segmentation.temporal =
+                hdr->segmentation.update_map ? dav1d_get_bits(gb, 1) : 0;
+            hdr->segmentation.update_data = dav1d_get_bits(gb, 1);
+        }
+
+        if (hdr->segmentation.update_data) {
+            hdr->segmentation.seg_data.preskip = 0;
+            hdr->segmentation.seg_data.last_active_segid = -1;
+            for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+                Dav1dSegmentationData *const seg =
+                    &hdr->segmentation.seg_data.d[i];
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->delta_q = dav1d_get_sbits(gb, 8);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                } else {
+                    seg->delta_q = 0;
+                }
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->delta_lf_y_v = dav1d_get_sbits(gb, 6);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                } else {
+                    seg->delta_lf_y_v = 0;
+                }
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->delta_lf_y_h = dav1d_get_sbits(gb, 6);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                } else {
+                    seg->delta_lf_y_h = 0;
+                }
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->delta_lf_u = dav1d_get_sbits(gb, 6);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                } else {
+                    seg->delta_lf_u = 0;
+                }
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->delta_lf_v = dav1d_get_sbits(gb, 6);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                } else {
+                    seg->delta_lf_v = 0;
+                }
+                if (dav1d_get_bits(gb, 1)) {
+                    seg->ref = dav1d_get_bits(gb, 3);
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                    hdr->segmentation.seg_data.preskip = 1;
+                } else {
+                    seg->ref = -1;
+                }
+                if ((seg->skip = dav1d_get_bits(gb, 1))) {
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                    hdr->segmentation.seg_data.preskip = 1;
+                }
+                if ((seg->globalmv = dav1d_get_bits(gb, 1))) {
+                    hdr->segmentation.seg_data.last_active_segid = i;
+                    hdr->segmentation.seg_data.preskip = 1;
+                }
+            }
+        } else {
+            // segmentation.update_data was false so we should copy
+            // segmentation data from the reference frame.
+            assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+            const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+            if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+            hdr->segmentation.seg_data =
+                c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
+        }
+    } else {
+        memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet));
+        for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
+            hdr->segmentation.seg_data.d[i].ref = -1;
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-segmentation: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // delta q
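+    // When present, quantizer deltas are coded at superblock granularity
+    // with a step size of 1 << res_log2 (likewise for loopfilter deltas).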
+    hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0;
+    hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0;
+    hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc &&
+                            dav1d_get_bits(gb, 1);
+    hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
+    hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0;
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-delta_q_lf_flags: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // derive lossless flags
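+    // A segment is lossless when its effective qindex is 0 and every
+    // DC/AC quantizer delta is also 0; all_lossless requires this to
+    // hold for every segment.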
+    const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta &&
+        !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta;
+    hdr->all_lossless = 1;
+    for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+        hdr->segmentation.qidx[i] = hdr->segmentation.enabled ?
+            iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) :
+            hdr->quant.yac;
+        hdr->segmentation.lossless[i] =
+            !hdr->segmentation.qidx[i] && delta_lossless;
+        hdr->all_lossless &= hdr->segmentation.lossless[i];
+    }
+
+    // loopfilter
+    if (hdr->all_lossless || hdr->allow_intrabc) {
+        hdr->loopfilter.level_y[0] = hdr->loopfilter.level_y[1] = 0;
+        hdr->loopfilter.level_u = hdr->loopfilter.level_v = 0;
+        hdr->loopfilter.sharpness = 0;
+        hdr->loopfilter.mode_ref_delta_enabled = 1;
+        hdr->loopfilter.mode_ref_delta_update = 1;
+        hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+    } else {
+        hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6);
+        hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6);
+        if (!seqhdr->monochrome &&
+            (hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1]))
+        {
+            hdr->loopfilter.level_u = dav1d_get_bits(gb, 6);
+            hdr->loopfilter.level_v = dav1d_get_bits(gb, 6);
+        }
+        hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
+
+        if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+            hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+        } else {
+            const int ref = hdr->refidx[hdr->primary_ref_frame];
+            if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+            hdr->loopfilter.mode_ref_deltas =
+                c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
+        }
+        hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1);
+        if (hdr->loopfilter.mode_ref_delta_enabled) {
+            hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1);
+            if (hdr->loopfilter.mode_ref_delta_update) {
+                for (int i = 0; i < 8; i++)
+                    if (dav1d_get_bits(gb, 1))
+                        hdr->loopfilter.mode_ref_deltas.ref_delta[i] =
+                            dav1d_get_sbits(gb, 6);
+                for (int i = 0; i < 2; i++)
+                    if (dav1d_get_bits(gb, 1))
+                        hdr->loopfilter.mode_ref_deltas.mode_delta[i] =
+                            dav1d_get_sbits(gb, 6);
+            }
+        }
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-lpf: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // cdef
+    if (!hdr->all_lossless && seqhdr->cdef && !hdr->allow_intrabc) {
+        hdr->cdef.damping = dav1d_get_bits(gb, 2) + 3;
+        hdr->cdef.n_bits = dav1d_get_bits(gb, 2);
+        for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) {
+            hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6);
+            if (!seqhdr->monochrome)
+                hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6);
+        }
+    } else {
+        hdr->cdef.n_bits = 0;
+        hdr->cdef.y_strength[0] = 0;
+        hdr->cdef.uv_strength[0] = 0;
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-cdef: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    // restoration
+    if ((!hdr->all_lossless || hdr->super_res.enabled) &&
+        seqhdr->restoration && !hdr->allow_intrabc)
+    {
+        hdr->restoration.type[0] = dav1d_get_bits(gb, 2);
+        if (!seqhdr->monochrome) {
+            hdr->restoration.type[1] = dav1d_get_bits(gb, 2);
+            hdr->restoration.type[2] = dav1d_get_bits(gb, 2);
+        } else {
+            hdr->restoration.type[1] =
+            hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+        }
+
+        if (hdr->restoration.type[0] || hdr->restoration.type[1] ||
+            hdr->restoration.type[2])
+        {
+            // Log2 of the restoration unit size.
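+            // The base unit is one superblock (64 or 128 pixels); each
+            // set bit below doubles it, up to a maximum of 256.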
+            hdr->restoration.unit_size[0] = 6 + seqhdr->sb128;
+            if (dav1d_get_bits(gb, 1)) {
+                hdr->restoration.unit_size[0]++;
+                if (!seqhdr->sb128)
+                    hdr->restoration.unit_size[0] += dav1d_get_bits(gb, 1);
+            }
+            hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
+            if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
+                seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
+            {
+                hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1);
+            }
+        } else {
+            hdr->restoration.unit_size[0] = 8;
+        }
+    } else {
+        hdr->restoration.type[0] = DAV1D_RESTORATION_NONE;
+        hdr->restoration.type[1] = DAV1D_RESTORATION_NONE;
+        hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-restoration: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
+                     dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-txfmmode: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-refmode: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->skip_mode_allowed = 0;
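+    // Skip mode needs a reference pair: prefer the nearest past
+    // (off_before) and nearest future (off_after) references in display
+    // order, falling back to the two nearest past references.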
+    if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
+        const unsigned poc = hdr->frame_offset;
+        unsigned off_before = 0xFFFFFFFFU;
+        int off_after = -1;
+        int off_before_idx, off_after_idx;
+        for (int i = 0; i < 7; i++) {
+            if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+            const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+
+            const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
+            if (diff > 0) {
+                if (off_after == -1 || get_poc_diff(seqhdr->order_hint_n_bits,
+                                                    off_after, refpoc) > 0)
+                {
+                    off_after = refpoc;
+                    off_after_idx = i;
+                }
+            } else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
+                                    get_poc_diff(seqhdr->order_hint_n_bits,
+                                                 refpoc, off_before) > 0))
+            {
+                off_before = refpoc;
+                off_before_idx = i;
+            }
+        }
+
+        if (off_before != 0xFFFFFFFFU && off_after != -1) {
+            hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
+            hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
+            hdr->skip_mode_allowed = 1;
+        } else if (off_before != 0xFFFFFFFFU) {
+            unsigned off_before2 = 0xFFFFFFFFU;
+            int off_before2_idx;
+            for (int i = 0; i < 7; i++) {
+                if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+                const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+                if (get_poc_diff(seqhdr->order_hint_n_bits,
+                                 refpoc, off_before) < 0) {
+                    if (off_before2 == 0xFFFFFFFFU ||
+                        get_poc_diff(seqhdr->order_hint_n_bits,
+                                     refpoc, off_before2) > 0)
+                    {
+                        off_before2 = refpoc;
+                        off_before2_idx = i;
+                    }
+                }
+            }
+
+            if (off_before2 != 0xFFFFFFFFU) {
+                hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
+                hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
+                hdr->skip_mode_allowed = 1;
+            }
+        }
+    }
+    hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-extskip: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
+        seqhdr->warped_motion && dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-warpmotionbit: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+    hdr->reduced_txtp_set = dav1d_get_bits(gb, 1);
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-reducedtxtpset: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    for (int i = 0; i < 7; i++)
+        hdr->gmv[i] = dav1d_default_wm_params;
+
+    if (hdr->frame_type & 1) {
+        for (int i = 0; i < 7; i++) {
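+            // The motion model type is unary-coded: is_global,
+            // then is_rot_zoom, then is_translation.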
+            hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
+                                dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :
+                                dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_TRANSLATION :
+                                                        DAV1D_WM_TYPE_AFFINE;
+
+            if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
+
+            const Dav1dWarpedMotionParams *ref_gmv;
+            if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+                ref_gmv = &dav1d_default_wm_params;
+            } else {
+                const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+                if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+                ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
+            }
+            int32_t *const mat = hdr->gmv[i].matrix;
+            const int32_t *const ref_mat = ref_gmv->matrix;
+            int bits, shift;
+
+            if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
+                mat[2] = (1 << 16) + 2 *
+                    dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12);
+                mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12);
+
+                bits = 12;
+                shift = 10;
+            } else {
+                bits = 9 - !hdr->hp;
+                shift = 13 + !hdr->hp;
+            }
+
+            if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) {
+                mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12);
+                mat[5] = (1 << 16) + 2 *
+                    dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12);
+            } else {
+                mat[4] = -mat[3];
+                mat[5] = mat[2];
+            }
+
+            mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
+            mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
+        }
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-gmv: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    hdr->film_grain.present = seqhdr->film_grain_present &&
+                              (hdr->show_frame || hdr->showable_frame) &&
+                              dav1d_get_bits(gb, 1);
+    if (hdr->film_grain.present) {
+        const unsigned seed = dav1d_get_bits(gb, 16);
+        hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1);
+        if (!hdr->film_grain.update) {
+            const int refidx = dav1d_get_bits(gb, 3);
+            int i;
+            for (i = 0; i < 7; i++)
+                if (hdr->refidx[i] == refidx)
+                    break;
+            if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
+            hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
+            hdr->film_grain.data.seed = seed;
+        } else {
+            Dav1dFilmGrainData *const fgd = &hdr->film_grain.data;
+            fgd->seed = seed;
+
+            fgd->num_y_points = dav1d_get_bits(gb, 4);
+            if (fgd->num_y_points > 14) goto error;
+            for (int i = 0; i < fgd->num_y_points; i++) {
+                fgd->y_points[i][0] = dav1d_get_bits(gb, 8);
+                if (i && fgd->y_points[i - 1][0] >= fgd->y_points[i][0])
+                    goto error;
+                fgd->y_points[i][1] = dav1d_get_bits(gb, 8);
+            }
+
+            fgd->chroma_scaling_from_luma =
+                !seqhdr->monochrome && dav1d_get_bits(gb, 1);
+            if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
+                (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
+            {
+                fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0;
+            } else for (int pl = 0; pl < 2; pl++) {
+                fgd->num_uv_points[pl] = dav1d_get_bits(gb, 4);
+                if (fgd->num_uv_points[pl] > 10) goto error;
+                for (int i = 0; i < fgd->num_uv_points[pl]; i++) {
+                    fgd->uv_points[pl][i][0] = dav1d_get_bits(gb, 8);
+                    if (i && fgd->uv_points[pl][i - 1][0] >= fgd->uv_points[pl][i][0])
+                        goto error;
+                    fgd->uv_points[pl][i][1] = dav1d_get_bits(gb, 8);
+                }
+            }
+
+            if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 &&
+                !!fgd->num_uv_points[0] != !!fgd->num_uv_points[1])
+            {
+                goto error;
+            }
+
+            fgd->scaling_shift = dav1d_get_bits(gb, 2) + 8;
+            fgd->ar_coeff_lag = dav1d_get_bits(gb, 2);
+            const int num_y_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1);
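+            // 2*lag*(lag+1) AR coefficients cover the causal grain
+            // neighborhood; when luma grain is present, chroma planes
+            // get one extra coefficient mixing in the collocated luma.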
+            if (fgd->num_y_points)
+                for (int i = 0; i < num_y_pos; i++)
+                    fgd->ar_coeffs_y[i] = dav1d_get_bits(gb, 8) - 128;
+            for (int pl = 0; pl < 2; pl++)
+                if (fgd->num_uv_points[pl] || fgd->chroma_scaling_from_luma) {
+                    const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
+                    for (int i = 0; i < num_uv_pos; i++)
+                        fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
+                    if (!fgd->num_y_points)
+                        fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
+                }
+            fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
+            fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
+            for (int pl = 0; pl < 2; pl++)
+                if (fgd->num_uv_points[pl]) {
+                    fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+                    fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+                    fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
+                }
+            fgd->overlap_flag = dav1d_get_bits(gb, 1);
+            fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1);
+        }
+    } else {
+        memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
+    }
+#if DEBUG_FRAME_HDR
+    printf("HDR: post-filmgrain: off=%ld\n",
+           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+    return 0;
+
+error:
+    dav1d_log(c, "Error parsing frame header\n");
+    return DAV1D_ERR(EINVAL);
+}
+
+static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
+    const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
+    const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0;
+
+    if (have_tile_pos) {
+        const int n_bits = c->frame_hdr->tiling.log2_cols +
+                           c->frame_hdr->tiling.log2_rows;
+        c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
+        c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
+    } else {
+        c->tile[c->n_tile_data].start = 0;
+        c->tile[c->n_tile_data].end = n_tiles - 1;
+    }
+}
+
+// Check that we haven't read more than obu_len bytes from the buffer
+// since init_bit_pos.
+static int check_for_overrun(Dav1dContext *const c, GetBits *const gb,
+                             const unsigned init_bit_pos,
+                             const unsigned obu_len)
+{
+    // Make sure we haven't actually read past the end of the gb buffer
+    if (gb->error) {
+        dav1d_log(c, "Overrun in OBU bit buffer\n");
+        return 1;
+    }
+
+    const unsigned pos = dav1d_get_bits_pos(gb);
+
+    // We assume that init_bit_pos was the bit position of the buffer
+    // at some point in the past, so pos cannot be smaller than it.
+    assert(init_bit_pos <= pos);
+
+    if (pos - init_bit_pos > 8 * obu_len) {
+        dav1d_log(c, "Overrun in OBU bit buffer into next OBU\n");
+        return 1;
+    }
+
+    return 0;
+}
+
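+// Returns the number of bytes consumed (the full OBU size) on success,
+// or a negative DAV1D_ERR code on failure.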
+int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) {
+    GetBits gb;
+    int res;
+
+    dav1d_init_get_bits(&gb, in->data, in->sz);
+
+    // obu header
+    dav1d_get_bits(&gb, 1); // obu_forbidden_bit
+    const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
+    const int has_extension = dav1d_get_bits(&gb, 1);
+    const int has_length_field = dav1d_get_bits(&gb, 1);
+    dav1d_get_bits(&gb, 1); // reserved
+
+    int temporal_id = 0, spatial_id = 0;
+    if (has_extension) {
+        temporal_id = dav1d_get_bits(&gb, 3);
+        spatial_id = dav1d_get_bits(&gb, 2);
+        dav1d_get_bits(&gb, 3); // reserved
+    }
+
+    // obu length field
+    const unsigned len = has_length_field ?
+        dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension;
+    if (gb.error) goto error;
+
+    const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
+    const unsigned init_byte_pos = init_bit_pos >> 3;
+    const unsigned pkt_bytelen = init_byte_pos + len;
+
+    // We must have read a whole number of bytes at this point (1 byte
+    // for the header and whole bytes at a time when reading the
+    // leb128 length field).
+    assert((init_bit_pos & 7) == 0);
+
+    // We also know that we haven't tried to read more than in->sz
+    // bytes yet (otherwise the error flag would have been set by the
+    // code in getbits.c)
+    assert(in->sz >= init_byte_pos);
+
+    // Make sure that there are enough bits left in the buffer for the
+    // rest of the OBU.
+    if (len > in->sz - init_byte_pos) goto error;
+
+    // skip OBUs not belonging to the selected temporal/spatial layer
+    if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
+        has_extension && c->operating_point_idc != 0)
+    {
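+        // operating_point_idc carries one flag per temporal layer in
+        // bits 0-7 and one per spatial layer in bits 8-11.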
+        const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
+        const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
+        if (!in_temporal_layer || !in_spatial_layer)
+            return len + init_byte_pos;
+    }
+
+    switch (type) {
+    case DAV1D_OBU_SEQ_HDR: {
+        Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
+        if (!ref) return DAV1D_ERR(ENOMEM);
+        Dav1dSequenceHeader *seq_hdr = ref->data;
+        memset(seq_hdr, 0, sizeof(*seq_hdr));
+        if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
+            dav1d_ref_dec(&ref);
+            return res;
+        }
+        if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+            dav1d_ref_dec(&ref);
+            return DAV1D_ERR(EINVAL);
+        }
+        // If we have read a sequence header which is different from
+        // the old one, this is a new video sequence and can't use any
+        // previous state. Free that state.
+        if (!c->seq_hdr)
+            c->frame_hdr = NULL;
+        // see 7.5, operating_parameter_info is allowed to change in
+        // sequence headers of a single sequence
+        else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
+            c->frame_hdr = NULL;
+            c->mastering_display = NULL;
+            c->content_light = NULL;
+            dav1d_ref_dec(&c->mastering_display_ref);
+            dav1d_ref_dec(&c->content_light_ref);
+            for (int i = 0; i < 8; i++) {
+                if (c->refs[i].p.p.data[0])
+                    dav1d_thread_picture_unref(&c->refs[i].p);
+                dav1d_ref_dec(&c->refs[i].segmap);
+                dav1d_ref_dec(&c->refs[i].refmvs);
+                dav1d_cdf_thread_unref(&c->cdf[i]);
+            }
+        }
+        dav1d_ref_dec(&c->seq_hdr_ref);
+        c->seq_hdr_ref = ref;
+        c->seq_hdr = seq_hdr;
+        break;
+    }
+    case DAV1D_OBU_REDUNDANT_FRAME_HDR:
+        if (c->frame_hdr) break;
+        // fall-through
+    case DAV1D_OBU_FRAME:
+    case DAV1D_OBU_FRAME_HDR:
+        if (global) break;
+        if (!c->seq_hdr) goto error;
+        if (!c->frame_hdr_ref) {
+            c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
+            if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
+        }
+#ifndef NDEBUG
+        // ensure that the reference is writable
+        assert(dav1d_ref_is_writable(c->frame_hdr_ref));
+#endif
+        c->frame_hdr = c->frame_hdr_ref->data;
+        memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
+        c->frame_hdr->temporal_id = temporal_id;
+        c->frame_hdr->spatial_id = spatial_id;
+        if ((res = parse_frame_hdr(c, &gb)) < 0) {
+            c->frame_hdr = NULL;
+            return res;
+        }
+        for (int n = 0; n < c->n_tile_data; n++)
+            dav1d_data_unref_internal(&c->tile[n].data);
+        c->n_tile_data = 0;
+        c->n_tiles = 0;
+        if (type != DAV1D_OBU_FRAME) {
+            // This is actually a frame header OBU so read the
+            // trailing bit and check for overrun.
+            dav1d_get_bits(&gb, 1);
+            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+                c->frame_hdr = NULL;
+                return DAV1D_ERR(EINVAL);
+            }
+        }
+
+        if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] *
+            c->frame_hdr->height > c->frame_size_limit)
+        {
+            dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1],
+                      c->frame_hdr->height, c->frame_size_limit);
+            c->frame_hdr = NULL;
+            return DAV1D_ERR(ERANGE);
+        }
+
+        if (type != DAV1D_OBU_FRAME)
+            break;
+        // OBU_FRAMEs shouldn't be signaled with show_existing_frame
+        if (c->frame_hdr->show_existing_frame) {
+            c->frame_hdr = NULL;
+            goto error;
+        }
+
+        // This is the frame header at the start of a frame OBU.
+        // There's no trailing bit at the end to skip, but we do need
+        // to align to the next byte.
+        dav1d_bytealign_get_bits(&gb);
+        // fall-through
+    case DAV1D_OBU_TILE_GRP: {
+        if (global) break;
+        if (!c->frame_hdr) goto error;
+        if (c->n_tile_data_alloc < c->n_tile_data + 1) {
+            if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
+            struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
+            if (!tile) goto error;
+            c->tile = tile;
+            memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
+            c->n_tile_data_alloc = c->n_tile_data + 1;
+        }
+        parse_tile_hdr(c, &gb);
+        // Align to the next byte boundary and check for overrun.
+        dav1d_bytealign_get_bits(&gb);
+        if (check_for_overrun(c, &gb, init_bit_pos, len))
+            return DAV1D_ERR(EINVAL);
+        // The current bit position is a multiple of 8 (because we
+        // just aligned it) and less than 8*pkt_bytelen because
+        // otherwise the overrun check would have fired.
+        const unsigned bit_pos = dav1d_get_bits_pos(&gb);
+        assert((bit_pos & 7) == 0);
+        assert(pkt_bytelen >= (bit_pos >> 3));
+        dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
+        c->tile[c->n_tile_data].data.data += bit_pos >> 3;
+        c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
+        // ensure tile groups are in order and sane, see 6.10.1
+        if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
+            c->tile[c->n_tile_data].start != c->n_tiles)
+        {
+            for (int i = 0; i <= c->n_tile_data; i++)
+                dav1d_data_unref_internal(&c->tile[i].data);
+            c->n_tile_data = 0;
+            c->n_tiles = 0;
+            goto error;
+        }
+        c->n_tiles += 1 + c->tile[c->n_tile_data].end -
+                          c->tile[c->n_tile_data].start;
+        c->n_tile_data++;
+        break;
+    }
+    case DAV1D_OBU_METADATA: {
+        // obu metadata type field
+        const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
+        const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
+        if (gb.error) goto error;
+        Dav1dRef *ref;
+        Dav1dContentLightLevel *content_light;
+        Dav1dMasteringDisplay *mastering_display;
+        Dav1dITUTT35 *itut_t35_metadata;
+
+        switch (meta_type) {
+        case OBU_META_HDR_CLL:
+            ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
+            if (!ref) return DAV1D_ERR(ENOMEM);
+            content_light = ref->data;
+            memset(content_light, 0, sizeof(*content_light));
+
+            content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
+            content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
+
+            // Skip the trailing bit, align to the next byte boundary and check for overrun.
+            dav1d_get_bits(&gb, 1);
+            dav1d_bytealign_get_bits(&gb);
+            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+                dav1d_ref_dec(&ref);
+                goto error;
+            }
+
+            dav1d_ref_dec(&c->content_light_ref);
+            c->content_light = content_light;
+            c->content_light_ref = ref;
+            break;
+        case OBU_META_HDR_MDCV: {
+            ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
+            if (!ref) return DAV1D_ERR(ENOMEM);
+            mastering_display = ref->data;
+            memset(mastering_display, 0, sizeof(*mastering_display));
+
+            for (int i = 0; i < 3; i++) {
+                mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
+                mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
+            }
+            mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
+            mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
+
+            mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
+            mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
+
+            // Skip the trailing bit, align to the next byte boundary and check for overrun.
+            dav1d_get_bits(&gb, 1);
+            dav1d_bytealign_get_bits(&gb);
+            if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+                dav1d_ref_dec(&ref);
+                goto error;
+            }
+
+            dav1d_ref_dec(&c->mastering_display_ref);
+            c->mastering_display = mastering_display;
+            c->mastering_display_ref = ref;
+            break;
+        }
+        case OBU_META_ITUT_T35: {
+            int payload_size = len;
+            // Don't count the trailing bits toward payload_size
+            while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
+                payload_size--; // trailing_zero_bit x 8
+            payload_size--; // trailing_one_bit + trailing_zero_bit x 7
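+            // (The payload is terminated by a single one bit followed
+            // by zero padding, which is not part of the payload.)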
+
+            // Don't take into account meta_type bytes
+            payload_size -= meta_type_len;
+
+            int country_code_extension_byte = 0;
+            const int country_code = dav1d_get_bits(&gb, 8);
+            payload_size--;
+            if (country_code == 0xFF) {
+                country_code_extension_byte = dav1d_get_bits(&gb, 8);
+                payload_size--;
+            }
+
+            if (payload_size <= 0) {
+                dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
+                goto error;
+            }
+
+            ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
+            if (!ref) return DAV1D_ERR(ENOMEM);
+            itut_t35_metadata = ref->data;
+
+            // We need our public headers to be C++ compatible, so payload can't be
+            // a flexible array member
+            itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
+            itut_t35_metadata->country_code = country_code;
+            itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
+            for (int i = 0; i < payload_size; i++)
+                itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
+            itut_t35_metadata->payload_size = payload_size;
+
+            dav1d_ref_dec(&c->itut_t35_ref);
+            c->itut_t35 = itut_t35_metadata;
+            c->itut_t35_ref = ref;
+            break;
+        }
+        case OBU_META_SCALABILITY:
+        case OBU_META_TIMECODE:
+            // ignore metadata OBUs we don't care about
+            break;
+        default:
+            // print a warning but don't fail for unknown types
+            dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type);
+            break;
+        }
+
+        break;
+    }
+    case DAV1D_OBU_PADDING:
+    case DAV1D_OBU_TD:
+        // ignore OBUs we don't care about
+        break;
+    default:
+        // print a warning but don't fail for unknown types
+        dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len);
+        break;
+    }
+
+    if (c->seq_hdr && c->frame_hdr) {
+        if (c->frame_hdr->show_existing_frame) {
+            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
+            if (c->n_fc == 1) {
+                dav1d_picture_ref(&c->out,
+                                  &c->refs[c->frame_hdr->existing_frame_idx].p.p);
+                dav1d_data_props_copy(&c->out.m, &in->m);
+            } else {
+                // need to append this to the frame output queue
+                const unsigned next = c->frame_thread.next++;
+                if (c->frame_thread.next == c->n_fc)
+                    c->frame_thread.next = 0;
+
+                Dav1dFrameContext *const f = &c->fc[next];
+                pthread_mutex_lock(&f->frame_thread.td.lock);
+                while (f->n_tile_data > 0)
+                    pthread_cond_wait(&f->frame_thread.td.cond,
+                                      &f->frame_thread.td.lock);
+                Dav1dThreadPicture *const out_delayed =
+                    &c->frame_thread.out_delayed[next];
+                if (out_delayed->p.data[0]) {
+                    const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+                                                                   memory_order_relaxed);
+                    if (out_delayed->visible && progress != FRAME_ERROR)
+                        dav1d_picture_ref(&c->out, &out_delayed->p);
+                    dav1d_thread_picture_unref(out_delayed);
+                }
+                dav1d_thread_picture_ref(out_delayed,
+                                         &c->refs[c->frame_hdr->existing_frame_idx].p);
+                out_delayed->visible = 1;
+                dav1d_data_props_copy(&out_delayed->p.m, &in->m);
+                pthread_mutex_unlock(&f->frame_thread.td.lock);
+            }
+            if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
+                const int r = c->frame_hdr->existing_frame_idx;
+                for (int i = 0; i < 8; i++) {
+                    if (i == r) continue;
+
+                    if (c->refs[i].p.p.data[0])
+                        dav1d_thread_picture_unref(&c->refs[i].p);
+                    dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
+
+                    dav1d_cdf_thread_unref(&c->cdf[i]);
+                    dav1d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]);
+
+                    dav1d_ref_dec(&c->refs[i].segmap);
+                    c->refs[i].segmap = c->refs[r].segmap;
+                    if (c->refs[r].segmap)
+                        dav1d_ref_inc(c->refs[r].segmap);
+                    dav1d_ref_dec(&c->refs[i].refmvs);
+                }
+            }
+            c->frame_hdr = NULL;
+        } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
+            if (!c->n_tile_data)
+                return DAV1D_ERR(EINVAL);
+            if ((res = dav1d_submit_frame(c)) < 0)
+                return res;
+            assert(!c->n_tile_data);
+            c->frame_hdr = NULL;
+            c->n_tiles = 0;
+        }
+    }
+
+    return len + init_byte_pos;
+
+error:
+    dav1d_log(c, "Error parsing OBU data\n");
+    return DAV1D_ERR(EINVAL);
+}
diff --git a/src/obu.h b/src/obu.h
new file mode 100644 (file)
index 0000000..aa79b52
--- /dev/null
+++ b/src/obu.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_OBU_H
+#define DAV1D_SRC_OBU_H
+
+#include "dav1d/data.h"
+#include "src/internal.h"
+
+int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global);
+
+#endif /* DAV1D_SRC_OBU_H */
diff --git a/src/picture.c b/src/picture.c
new file mode 100644 (file)
index 0000000..72af92e
--- /dev/null
+++ b/src/picture.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/intops.h"
+#include "common/mem.h"
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/picture.h"
+#include "src/ref.h"
+#include "src/thread.h"
+#include "src/thread_task.h"
+
+int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
+    assert(cookie == NULL);
+    const int hbd = p->p.bpc > 8;
+    const int aligned_w = (p->p.w + 127) & ~127;
+    const int aligned_h = (p->p.h + 127) & ~127;
+    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    ptrdiff_t y_stride = aligned_w << hbd;
+    ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
+    /* Due to how mapping of addresses to sets works in most L1 and L2 cache
+     * implementations, strides of multiples of certain power-of-two numbers
+     * may cause multiple rows of the same superblock to map to the same set,
+     * causing evictions of previous rows resulting in a reduction in cache
+     * hit rate. Avoid that by slightly padding the stride when necessary. */
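+    /* Example: a 1024-pixel-wide 8 bpc frame gets a 1024-byte luma
+     * stride, which is padded to 1024 + DAV1D_PICTURE_ALIGNMENT. */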
+    if (!(y_stride & 1023))
+        y_stride += DAV1D_PICTURE_ALIGNMENT;
+    if (!(uv_stride & 1023) && has_chroma)
+        uv_stride += DAV1D_PICTURE_ALIGNMENT;
+    p->stride[0] = y_stride;
+    p->stride[1] = uv_stride;
+    const size_t y_sz = y_stride * aligned_h;
+    const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
+    const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
+    uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
+    if (!data) return DAV1D_ERR(ENOMEM);
+
+    p->data[0] = data;
+    p->data[1] = has_chroma ? data + y_sz : NULL;
+    p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
+
+#ifndef NDEBUG /* safety check */
+    p->allocator_data = data;
+#endif
+
+    return 0;
+}
+
+void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
+    assert(cookie == NULL);
+#ifndef NDEBUG /* safety check */
+    assert(p->allocator_data == p->data[0]);
+#endif
+    dav1d_free_aligned(p->data[0]);
+}
+
+struct pic_ctx_context {
+    Dav1dPicAllocator allocator;
+    Dav1dPicture pic;
+    void *extra_ptr; /* MUST BE AT THE END */
+};
+
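+// Release callback invoked once the last reference to the wrapped
+// picture buffer is dropped; it returns the picture to the allocator
+// it was created with.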
+static void free_buffer(const uint8_t *const data, void *const user_data) {
+    struct pic_ctx_context *pic_ctx = user_data;
+
+    pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
+                                                pic_ctx->allocator.cookie);
+    free(pic_ctx);
+}
+
+static int picture_alloc_with_edges(Dav1dContext *const c,
+                                    Dav1dPicture *const p,
+                                    const int w, const int h,
+                                    Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
+                                    Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
+                                    Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
+                                    Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
+                                    Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
+                                    const int bpc,
+                                    const Dav1dDataProps *const props,
+                                    Dav1dPicAllocator *const p_allocator,
+                                    const size_t extra, void **const extra_ptr)
+{
+    if (p->data[0]) {
+        dav1d_log(c, "Picture already allocated!\n");
+        return -1;
+    }
+    assert(bpc > 0 && bpc <= 16);
+
+    struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context));
+    if (pic_ctx == NULL)
+        return DAV1D_ERR(ENOMEM);
+
+    p->p.w = w;
+    p->p.h = h;
+    p->seq_hdr = seq_hdr;
+    p->frame_hdr = frame_hdr;
+    p->content_light = content_light;
+    p->mastering_display = mastering_display;
+    p->itut_t35 = itut_t35;
+    p->p.layout = seq_hdr->layout;
+    p->p.bpc = bpc;
+    dav1d_data_props_set_defaults(&p->m);
+    const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
+    if (res < 0) {
+        free(pic_ctx);
+        return res;
+    }
+
+    pic_ctx->allocator = *p_allocator;
+    pic_ctx->pic = *p;
+
+    if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
+        p_allocator->release_picture_callback(p, p_allocator->cookie);
+        free(pic_ctx);
+        dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno));
+        return DAV1D_ERR(ENOMEM);
+    }
+
+    p->seq_hdr_ref = seq_hdr_ref;
+    if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
+
+    p->frame_hdr_ref = frame_hdr_ref;
+    if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
+
+    dav1d_data_props_copy(&p->m, props);
+
+    if (extra && extra_ptr)
+        *extra_ptr = &pic_ctx->extra_ptr;
+
+    p->content_light_ref = content_light_ref;
+    if (content_light_ref) dav1d_ref_inc(content_light_ref);
+
+    p->mastering_display_ref = mastering_display_ref;
+    if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
+
+    p->itut_t35_ref = itut_t35_ref;
+    if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
+
+    return 0;
+}
+
+int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f,
+                               const int bpc)
+{
+    Dav1dThreadPicture *const p = &f->sr_cur;
+    p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL;
+
+    const int res =
+        picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
+                                 f->seq_hdr, f->seq_hdr_ref,
+                                 f->frame_hdr, f->frame_hdr_ref,
+                                 c->content_light, c->content_light_ref,
+                                 c->mastering_display, c->mastering_display_ref,
+                                 c->itut_t35, c->itut_t35_ref,
+                                 bpc, &f->tile[0].data.m, &c->allocator,
+                                 p->t != NULL ? sizeof(atomic_int) * 2 : 0,
+                                 (void **) &p->progress);
+    if (res) return res;
+
+    // Must be removed from the context after being attached to the frame
+    dav1d_ref_dec(&c->itut_t35_ref);
+    c->itut_t35 = NULL;
+
+    p->visible = f->frame_hdr->show_frame;
+    if (p->t) {
+        atomic_init(&p->progress[0], 0);
+        atomic_init(&p->progress[1], 0);
+    }
+    return res;
+}
+
+int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w,
+                             const Dav1dPicture *const src)
+{
+    struct pic_ctx_context *const pic_ctx = src->ref->user_data;
+    const int res = picture_alloc_with_edges(c, dst, w, src->p.h,
+                                             src->seq_hdr, src->seq_hdr_ref,
+                                             src->frame_hdr, src->frame_hdr_ref,
+                                             src->content_light, src->content_light_ref,
+                                             src->mastering_display, src->mastering_display_ref,
+                                             src->itut_t35, src->itut_t35_ref,
+                                             src->p.bpc, &src->m, &pic_ctx->allocator,
+                                             0, NULL);
+    return res;
+}
+
+void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data[0] == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref) {
+        validate_input(src->data[0] != NULL);
+        dav1d_ref_inc(src->ref);
+        if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
+        if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+        if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+        if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
+        if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
+        if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
+    }
+    *dst = *src;
+}
+
+void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data[0] == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref)
+        validate_input(src->data[0] != NULL);
+
+    *dst = *src;
+    memset(src, 0, sizeof(*src));
+}
+
+void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
+                              const Dav1dThreadPicture *const src)
+{
+    dav1d_picture_ref(&dst->p, &src->p);
+    dst->t = src->t;
+    dst->visible = src->visible;
+    dst->progress = src->progress;
+}
+
+void dav1d_picture_unref_internal(Dav1dPicture *const p) {
+    validate_input(p != NULL);
+
+    if (p->ref) {
+        validate_input(p->data[0] != NULL);
+        dav1d_ref_dec(&p->ref);
+        dav1d_ref_dec(&p->seq_hdr_ref);
+        dav1d_ref_dec(&p->frame_hdr_ref);
+        dav1d_ref_dec(&p->m.user_data.ref);
+        dav1d_ref_dec(&p->content_light_ref);
+        dav1d_ref_dec(&p->mastering_display_ref);
+        dav1d_ref_dec(&p->itut_t35_ref);
+    }
+    memset(p, 0, sizeof(*p));
+}
+
+void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
+    dav1d_picture_unref_internal(&p->p);
+
+    p->t = NULL;
+    p->progress = NULL;
+}
+
+int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
+                              int y_unclipped, const enum PlaneType plane_type)
+{
+    assert(plane_type != PLANE_TYPE_ALL);
+
+    if (!p->t)
+        return 0;
+
+    // convert to luma units; include plane delay from loopfilters; clip
+    const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1
+    y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter
+    const unsigned y = iclip(y_unclipped, 1, p->p.p.h);
+    atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK];
+    unsigned state;
+
+    if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y)
+        return state == FRAME_ERROR;
+
+    pthread_mutex_lock(&p->t->lock);
+    while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y)
+        pthread_cond_wait(&p->t->cond, &p->t->lock);
+    pthread_mutex_unlock(&p->t->lock);
+    return state == FRAME_ERROR;
+}
+
+void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p,
+                                 const int y, // in pixel units
+                                 const enum PlaneType plane_type)
+{
+    assert(plane_type != PLANE_TYPE_UV);
+
+    if (!p->t)
+        return;
+
+    pthread_mutex_lock(&p->t->lock);
+    if (plane_type != PLANE_TYPE_Y)
+        atomic_store(&p->progress[0], y);
+    if (plane_type != PLANE_TYPE_BLOCK)
+        atomic_store(&p->progress[1], y);
+    pthread_cond_broadcast(&p->t->cond);
+    pthread_mutex_unlock(&p->t->lock);
+}
diff --git a/src/picture.h b/src/picture.h
new file mode 100644 (file)
index 0000000..9f82de8
--- /dev/null
+++ b/src/picture.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PICTURE_H
+#define DAV1D_SRC_PICTURE_H
+
+#include <stdatomic.h>
+
+#include "src/thread.h"
+#include "dav1d/picture.h"
+
+#include "src/thread_data.h"
+#include "src/ref.h"
+
+enum PlaneType {
+    PLANE_TYPE_Y,
+    PLANE_TYPE_UV,
+    PLANE_TYPE_BLOCK,
+    PLANE_TYPE_ALL,
+};
+
+typedef struct Dav1dThreadPicture {
+    Dav1dPicture p;
+    int visible;
+    struct thread_data *t;
+    // [0] block data (including segmentation map and motion vectors)
+    // [1] pixel data
+    atomic_uint *progress;
+} Dav1dThreadPicture;
+
+/*
+ * Allocate a picture with custom border size.
+ */
+int dav1d_thread_picture_alloc(Dav1dContext *c, Dav1dFrameContext *f, const int bpc);
+
+/**
+ * Allocate a picture with identical metadata to an existing picture.
+ * The width is a separate argument so this function can be used for
+ * super-res, where the width changes, but everything else is the same.
+ * For the more typical use case of allocating a new image of the same
+ * dimensions, use src->p.w as width.
+ */
+int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
+                             const Dav1dPicture *src);
+
+/**
+ * Create a copy of a picture.
+ */
+void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
+void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
+                              const Dav1dThreadPicture *src);
+void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
+
+/**
+ * Move a picture reference.
+ */
+void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);
+
+/**
+ * Wait for picture to reach a certain stage.
+ *
+ * y is in full-pixel units. If plane_type is not UV, this is in luma
+ * units, else it is in chroma units.
+ * plane_type is used to determine how many pixels delay are
+ * introduced by loopfilter processes.
+ *
+ * Returns 0 on success, and 1 if there was an error while decoding p
+ */
+int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y,
+                               enum PlaneType plane_type);
+
+/**
+ * Signal decoding progress.
+ *
+ * y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding
+ * error to frames using this frame as reference frame.
+ * plane_type denotes whether we have completed block data (pass 1;
+ * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
+ * 2-pass decoding; PLANE_TYPE_ALL).
+ */
+void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
+                                 enum PlaneType plane_type);
+
+int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
+void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
+void dav1d_picture_unref_internal(Dav1dPicture *p);
+
+#endif /* DAV1D_SRC_PICTURE_H */
diff --git a/src/ppc/cdef_init_tmpl.c b/src/ppc/cdef_init_tmpl.c
new file mode 100644 (file)
index 0000000..07cbce6
--- /dev/null
+++ b/src/ppc/cdef_init_tmpl.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+
+#include "common/bitdepth.h"
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/cpu.h"
+
+#include "src/ppc/types.h"
+
+#if BITDEPTH == 8
+static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
+                               const int damping)
+{
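+    // Computes the CDEF constraint function
+    //   sign(diff) * min(|diff|, max(0, threshold - (|diff| >> shift)))
+    // on eight lanes at a time.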
+    const i16x8 zero = vec_splat_s16(0);
+    if (!threshold) return zero;
+    const uint16_t shift = imax(0, damping - ulog2(threshold));
+    const i16x8 abs_diff = vec_abs(diff);
+    const b16x8 mask = vec_cmplt(diff, zero);
+    const i16x8 thr = vec_splats(threshold);
+    const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
+    const i16x8 max = vec_max(zero, sub);
+    const i16x8 min = vec_min(abs_diff, max);
+    const i16x8 neg = vec_sub(zero, min);
+    return vec_sel(min, neg, mask);
+}
+
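+// Copy a 4-wide, h-row block plus a 2-pixel border on each side into the
+// 8-entry-stride tmp buffer, widening to 16 bits. Borders that are not
+// available according to `edges` are filled with the INT16_MAX sentinel
+// so that max_mask() below can ignore them.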
+static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+                           const uint8_t *src, const ptrdiff_t src_stride,
+                           const uint8_t (*left)[2], const uint8_t *const top,
+                           const int w, const int h,
+                           const enum CdefEdgeFlags edges)
+{
+    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+    u16x8 l0;
+    u16x8 l1;
+
+    int y_start = -2, y_end = h + 2;
+
+    // Copy top and bottom first
+    if (!(edges & CDEF_HAVE_TOP)) {
+        l0 = fill;
+        l1 = fill;
+        y_start = 0;
+    } else {
+        l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+        l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
+    }
+
+    vec_st(l0, 0, tmp - 2 * 8);
+    vec_st(l1, 0, tmp - 1 * 8);
+
+    if (!(edges & CDEF_HAVE_BOTTOM)) {
+        l0 = fill;
+        l1 = fill;
+        y_end -= 2;
+    } else {
+        l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride));
+        l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride));
+    }
+
+    vec_st(l0, 0, tmp + (h + 0) * 8);
+    vec_st(l1, 0, tmp + (h + 1) * 8);
+
+    for (int y = 0; y < h; y++) {
+        u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
+        vec_st(l, 0, tmp + y * 8);
+    }
+
+    if (!(edges & CDEF_HAVE_LEFT)) {
+        for (int y = y_start; y < y_end; y++) {
+            tmp[y * 8] = INT16_MAX;
+            tmp[1 + y * 8] = INT16_MAX;
+        }
+    } else {
+        for (int y = 0; y < h; y++) {
+            tmp[y * 8] = left[y][0];
+            tmp[1 + y * 8] = left[y][1];
+        }
+    }
+    if (!(edges & CDEF_HAVE_RIGHT)) {
+        for (int y = y_start; y < y_end; y++) {
+            tmp[- 2 + (y + 1) * 8] = INT16_MAX;
+            tmp[- 1 + (y + 1) * 8] = INT16_MAX;
+        }
+    }
+}
+
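+// 8-wide variant of copy4xN; the tmp buffer uses a stride of 16 entries.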
+static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+                           const uint8_t *src, const ptrdiff_t src_stride,
+                           const uint8_t (*left)[2], const uint8_t *const top,
+                           const int w, const int h,
+                           const enum CdefEdgeFlags edges)
+{
+    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+    u16x8 l0h, l0l;
+    u16x8 l1h, l1l;
+
+    int y_start = -2, y_end = h + 2;
+
+    // Copy top and bottom first
+    if (!(edges & CDEF_HAVE_TOP)) {
+        l0h = fill;
+        l0l = fill;
+        l1h = fill;
+        l1l = fill;
+        y_start = 0;
+    } else {
+        u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+        u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
+        l0h = u8h_to_u16(l0);
+        l0l = u8l_to_u16(l0);
+        l1h = u8h_to_u16(l1);
+        l1l = u8l_to_u16(l1);
+    }
+
+    vec_st(l0h, 0, tmp - 4 * 8);
+    vec_st(l0l, 0, tmp - 3 * 8);
+    vec_st(l1h, 0, tmp - 2 * 8);
+    vec_st(l1l, 0, tmp - 1 * 8);
+
+    if (!(edges & CDEF_HAVE_BOTTOM)) {
+        l0h = fill;
+        l0l = fill;
+        l1h = fill;
+        l1l = fill;
+        y_end -= 2;
+    } else {
+        u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride);
+        u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride);
+        l0h = u8h_to_u16(l0);
+        l0l = u8l_to_u16(l0);
+        l1h = u8h_to_u16(l1);
+        l1l = u8l_to_u16(l1);
+    }
+
+    vec_st(l0h, 0, tmp + (h + 0) * 16);
+    vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
+    vec_st(l1h, 0, tmp + (h + 1) * 16);
+    vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
+
+    for (int y = 0; y < h; y++) {
+        u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
+        u16x8 lh = u8h_to_u16(l);
+        u16x8 ll = u8l_to_u16(l);
+        vec_st(lh, 0, tmp + y * 16);
+        vec_st(ll, 0, tmp + 8 + y * 16);
+    }
+
+    if (!(edges & CDEF_HAVE_LEFT)) {
+        for (int y = y_start; y < y_end; y++) {
+            tmp[y * 16] = INT16_MAX;
+            tmp[1 + y * 16] = INT16_MAX;
+        }
+    } else {
+        for (int y = 0; y < h; y++) {
+            tmp[y * 16] = left[y][0];
+            tmp[1 + y * 16] = left[y][1];
+        }
+    }
+    if (!(edges & CDEF_HAVE_RIGHT)) {
+        for (int y = y_start; y < y_end; y++) {
+            tmp[- 6 + (y + 1) * 16] = INT16_MAX;
+            tmp[- 5 + (y + 1) * 16] = INT16_MAX;
+        }
+    }
+}
+
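+// max() that treats the INT16_MAX padding sentinel as "no pixel": where a
+// holds the sentinel, b is substituted so padding never wins the maximum.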
+static inline i16x8 max_mask(i16x8 a, i16x8 b) {
+    const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);
+
+    const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);
+
+    const i16x8 val = vec_sel(a, b, mask);
+
+    return vec_max(val, b);
+}
+
+#define LOAD_PIX(addr) \
+    const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
+    i16x8 max = px; \
+    i16x8 min = px; \
+    i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_PIX4(addr) \
+    const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
+    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+    const i16x8 px = vec_xxpermdi(a, b, 0); \
+    i16x8 max = px; \
+    i16x8 min = px; \
+    i16x8 sum = vec_splat_s16(0);
+
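+// LOAD_DIR fetches the four neighbours at +/-o0 and +/-o1 along one filter
+// direction; LOAD_DIR4 does the same for a pair of 4-pixel rows packed
+// into a single vector.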
+#define LOAD_DIR(p, addr, o0, o1) \
+    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
+    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
+    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
+    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);
+
+#define LOAD_DIR4(p, addr, o0, o1) \
+    LOAD_DIR(p ## a, addr, o0, o1) \
+    LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
+    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
+    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
+    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
+
+#define CONSTRAIN(p, strength) \
+    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
+    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
+    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
+    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
+\
+    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
+    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
+    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
+    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+
+#define MIN_MAX(p) \
+    max = max_mask(p ## 0, max); \
+    min = vec_min(p ## 0, min); \
+    max = max_mask(p ## 1, max); \
+    min = vec_min(p ## 1, min); \
+    max = max_mask(p ## 2, max); \
+    min = vec_min(p ## 2, min); \
+    max = max_mask(p ## 3, max); \
+    min = vec_min(p ## 3, min);
+
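+// Tap weights: the primary taps are {4, 2} when (pri_strength >>
+// bitdepth_min_8) is even and {3, 3} when odd; tap_even folds that
+// difference into the shifts. The secondary taps are {2, 1}, the factor 1
+// being applied implicitly by UPDATE_SUM for the second secondary pass.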
+#define PRI_0(p) \
+    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
+    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+
+#define PRI_1(p) \
+    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
+    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
+
+#define SEC_0(p) \
+    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
+    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
+    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
+    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+
+#define UPDATE_SUM(p) \
+    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
+    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
+    sum = vec_add(sum, p ## sum0); \
+    sum = vec_add(sum, p ## sum1);
+
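+// CDEF filter for 4-wide blocks; each iteration filters two rows packed
+// into one vector (LOAD_PIX4). A primary pass along `dir` and two
+// secondary passes along dir +/- 2 accumulate constrained differences,
+// and the result is clamped to the [min, max] of the contributing pixels.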
+static inline void
+filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const int w, const int h, const int pri_strength,
+           const int sec_strength, const int dir,
+           const int damping, const enum CdefEdgeFlags edges,
+           const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
+        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
+    };
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+    const int off1 = cdef_directions[dir][0];
+    const int off1_1 = cdef_directions[dir][1];
+
+    const int off2 = cdef_directions[(dir + 2) & 7][0];
+    const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+    copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);
+    for (int y = 0; y < h / 2; y++) {
+        LOAD_PIX4(tmp)
+
+        // Primary pass
+        LOAD_DIR4(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength)
+
+        MIN_MAX(p)
+
+        PRI_0(p)
+        PRI_1(p)
+
+        UPDATE_SUM(p)
+
+        // Secondary pass 1
+        LOAD_DIR4(s, tmp, off2, off3)
+
+        CONSTRAIN(s, sec_strength)
+
+        MIN_MAX(s)
+
+        SEC_0(s)
+
+        UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength)
+
+        MIN_MAX(s2)
+
+        UPDATE_SUM(s2)
+
+        // Store
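+        // Rounding as in the C reference: px + ((sum + 8 - (sum < 0)) >> 4),
+        // clamped to the [min, max] computed above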
+        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+        bias = vec_sub(vec_splat_s16(8), bias);
+        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+        dst[0] = vdst[0];
+        dst[1] = vdst[1];
+        dst[2] = vdst[2];
+        dst[3] = vdst[3];
+
+        tmp += tmp_stride;
+        dst += PXSTRIDE(dst_stride);
+        dst[0] = vdst[4];
+        dst[1] = vdst[5];
+        dst[2] = vdst[6];
+        dst[3] = vdst[7];
+
+        tmp += tmp_stride;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
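+// As filter_4xN, but each iteration filters one full 8-pixel row.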
+static inline void
+filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const int w, const int h, const int pri_strength,
+           const int sec_strength, const int dir,
+           const int damping, const enum CdefEdgeFlags edges,
+           const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
+        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
+        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
+        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
+    };
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+    const int off1 = cdef_directions[dir][0];
+    const int off1_1 = cdef_directions[dir][1];
+
+    const int off2 = cdef_directions[(dir + 2) & 7][0];
+    const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+    copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);
+
+    for (int y = 0; y < h; y++) {
+        LOAD_PIX(tmp)
+
+        // Primary pass
+        LOAD_DIR(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength)
+
+        MIN_MAX(p)
+
+        PRI_0(p)
+        PRI_1(p)
+
+        UPDATE_SUM(p)
+
+        // Secondary pass 1
+        LOAD_DIR(s, tmp, off2, off3)
+
+        CONSTRAIN(s, sec_strength)
+
+        MIN_MAX(s)
+
+        SEC_0(s)
+
+        UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength)
+
+        MIN_MAX(s2)
+
+        UPDATE_SUM(s2)
+
+        // Store
+        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+        bias = vec_sub(vec_splat_s16(8), bias);
+        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+        dst[0] = vdst[0];
+        dst[1] = vdst[1];
+        dst[2] = vdst[2];
+        dst[3] = vdst[3];
+        dst[4] = vdst[4];
+        dst[5] = vdst[5];
+        dst[6] = vdst[6];
+        dst[7] = vdst[7];
+
+        tmp += tmp_stride;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
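+// Instantiate the entry points registered in dav1d_cdef_dsp_init_ppc()
+// below; the 12-row tmp buffer covers h + 4 rows for the largest (8x8)
+// case.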
+#define cdef_fn(w, h, tmp_stride) \
+static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+                                        const ptrdiff_t dst_stride, \
+                                        const pixel (*left)[2], \
+                                        const pixel *const top, \
+                                        const int pri_strength, \
+                                        const int sec_strength, \
+                                        const int dir, \
+                                        const int damping, \
+                                        const enum CdefEdgeFlags edges) \
+{ \
+    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \
+    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
+    filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \
+                   dir, damping, edges, tmp_stride, tmp); \
+}
+
+cdef_fn(4, 4, 8);
+cdef_fn(4, 8, 8);
+cdef_fn(8, 8, 16);
+#endif
+
+COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+    // c->dir = dav1d_cdef_find_dir_vsx;
+    c->fb[0] = cdef_filter_8x8_vsx;
+    c->fb[1] = cdef_filter_4x8_vsx;
+    c->fb[2] = cdef_filter_4x4_vsx;
+#endif
+}
diff --git a/src/ppc/cpu.c b/src/ppc/cpu.c
new file mode 100644 (file)
index 0000000..fe77057
--- /dev/null
+++ b/src/ppc/cpu.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/ppc/cpu.h"
+
+#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE
+#include <sys/auxv.h>
+#define HAVE_AUX
+#endif
+
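+// VSX support is detected from the AT_HWCAP auxiliary vector, via
+// getauxval() on Linux or elf_aux_info() on FreeBSD.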
+COLD unsigned dav1d_get_cpu_flags_ppc(void) {
+    unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+#ifdef HAVE_AUX
+    flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
+#endif
+    return flags;
+}
diff --git a/src/ppc/cpu.h b/src/ppc/cpu.h
new file mode 100644 (file)
index 0000000..cfd2ff4
--- /dev/null
+++ b/src/ppc/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_CPU_H
+#define DAV1D_SRC_PPC_CPU_H
+
+enum CpuFlags {
+    DAV1D_PPC_CPU_FLAG_VSX = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_ppc(void);
+
+#endif /* DAV1D_SRC_PPC_CPU_H */
diff --git a/src/ppc/looprestoration_init_tmpl.c b/src/ppc/looprestoration_init_tmpl.c
new file mode 100644 (file)
index 0000000..78ff129
--- /dev/null
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/intops.h"
+#include "src/ppc/types.h"
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
+    v = vec_max(minv, v);
+    v = vec_min(maxv, v);
+    return v;
+}
+
+#define APPLY_FILTER_H(v, f, ssum1, ssum2) do {  \
+    i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
+    i16x8 ktmp_u16_low  = (i16x8) u8l_to_u16(v); \
+    ssum1 = vec_madd(ktmp_u16_high, f, ssum1);   \
+    ssum2 = vec_madd(ktmp_u16_low, f, ssum2);    \
+} while (0)
+
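+// Horizontal 7-tap Wiener pass (8 bpc), 16 pixels per iteration. Per
+// pixel this computes, like the C template,
+//   hor[i] = clip((128 * px + sum(filterh[k] * px_k) + (1 << 14) + 4) >> 3,
+//                 0, (1 << 13) - 1)
+// i.e. a 3-bit round with the bit-depth-dependent offset folded in.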
+static void wiener_filter_h_vsx(int32_t *hor_ptr,
+                                uint8_t *tmp_ptr,
+                                const int16_t filterh[7],
+                                const int w, const int h)
+{
+    static const i32x4 zerov = vec_splats(0);
+    static const i32x4 seven_vec = vec_splats(7);
+    static const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
+    static const i32x4 round_bits_vec = vec_splats(3);
+    static const i32x4 rounding_off_vec = vec_splats(1<<2);
+    static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
+
+    i16x8 filterhvall = vec_vsx_ld(0, filterh);
+    i16x8 filterhv0 = vec_splat(filterhvall, 0);
+    i16x8 filterhv1 = vec_splat(filterhvall, 1);
+    i16x8 filterhv2 = vec_splat(filterhvall, 2);
+    i16x8 filterhv3 = vec_splat(filterhvall, 3);
+    i16x8 filterhv4 = vec_splat(filterhvall, 4);
+    i16x8 filterhv5 = vec_splat(filterhvall, 5);
+    i16x8 filterhv6 = vec_splat(filterhvall, 6);
+
+    for (int j = 0; j < h + 6; j++) {
+        for (int i = 0; i < w; i += 16) {
+            i32x4 sum1 = bitdepth_added_vec;
+            i32x4 sum2 = bitdepth_added_vec;
+            i32x4 sum3 = bitdepth_added_vec;
+            i32x4 sum4 = bitdepth_added_vec;
+
+            u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
+            u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
+
+            u8x16 tmp_v1 = vec_sld(tmp_v7, tmp_v0, 15);
+            u8x16 tmp_v2 = vec_sld(tmp_v7, tmp_v0, 14);
+            u8x16 tmp_v3 = vec_sld(tmp_v7, tmp_v0, 13);
+            u8x16 tmp_v4 = vec_sld(tmp_v7, tmp_v0, 12);
+            u8x16 tmp_v5 = vec_sld(tmp_v7, tmp_v0, 11);
+            u8x16 tmp_v6 = vec_sld(tmp_v7, tmp_v0, 10);
+
+            u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
+            u16x8 tmp_u16_low  = u8l_to_u16(tmp_v3);
+
+            i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
+            i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
+            i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
+            i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
+
+            i16x8 ssum1 = (i16x8) zerov;
+            i16x8 ssum2 = (i16x8) zerov;
+
+            APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
+
+            sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
+            sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
+            sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
+            sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
+
+            sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
+            sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
+            sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
+            sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
+
+            sum1 = iclip_vec(sum1, zerov, clip_limit_v);
+            sum2 = iclip_vec(sum2, zerov, clip_limit_v);
+            sum3 = iclip_vec(sum3, zerov, clip_limit_v);
+            sum4 = iclip_vec(sum4, zerov, clip_limit_v);
+
+            vec_st(sum1,  0, &hor_ptr[i]);
+            vec_st(sum2, 16, &hor_ptr[i]);
+            vec_st(sum3, 32, &hor_ptr[i]);
+            vec_st(sum4, 48, &hor_ptr[i]);
+        }
+        tmp_ptr += REST_UNIT_STRIDE;
+        hor_ptr += REST_UNIT_STRIDE;
+    }
+}
+
+static inline i16x8 iclip_u8_vec(i16x8 v) {
+    static const i16x8 zerov = vec_splats((int16_t)0);
+    static const i16x8 maxv = vec_splats((int16_t)255);
+    v = vec_max(zerov, v);
+    v = vec_min(maxv, v);
+    return v;
+}
+
+#define APPLY_FILTER_V(index, f) do { \
+    i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+    i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+    i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+    i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+    sum1 = sum1 + v1 * f; \
+    sum2 = sum2 + v2 * f; \
+    sum3 = sum3 + v3 * f; \
+    sum4 = sum4 + v4 * f; \
+} while (0)
+
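+// Vertical 7-tap Wiener pass; per pixel this computes
+//   p[i] = clip_u8(((v << 7) + sum(filterv[k] * hor_k)
+//                   - (1 << 18) + (1 << 10)) >> 11)
+// where v is the centre row of the intermediate buffer hor.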
+#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
+    i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 sum1 = -round_offset_vec; \
+    i32x4 sum2 = -round_offset_vec; \
+    i32x4 sum3 = -round_offset_vec; \
+    i32x4 sum4 = -round_offset_vec; \
+    APPLY_FILTER_V(0, filterv0); \
+    APPLY_FILTER_V(1, filterv1); \
+    APPLY_FILTER_V(2, filterv2); \
+    APPLY_FILTER_V(3, filterv3); \
+    APPLY_FILTER_V(4, filterv4); \
+    APPLY_FILTER_V(5, filterv5); \
+    APPLY_FILTER_V(6, filterv6); \
+    sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \
+    sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \
+    sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \
+    sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \
+    sum1 = sum1 >> round_bits_vec; \
+    sum2 = sum2 >> round_bits_vec; \
+    sum3 = sum3 >> round_bits_vec; \
+    sum4 = sum4 >> round_bits_vec; \
+    i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+    i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
+    sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
+    sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
+    sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
+} while (0)
+
+static inline void wiener_filter_v_vsx(uint8_t *p,
+                                       const ptrdiff_t p_stride,
+                                       const int32_t *hor,
+                                       const int16_t filterv[7],
+                                       const int w, const int h)
+{
+    static const i32x4 round_bits_vec = vec_splats(11);
+    static const i32x4 rounding_off_vec = vec_splats(1 << 10);
+    static const i32x4 round_offset_vec = vec_splats(1 << 18);
+    static const i32x4 seven_vec = vec_splats(7);
+
+    i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
+    i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
+    i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
+    i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
+    i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
+    i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
+    i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
+
+    for (int j = 0; j < h; j++) {
+        for (int i = 0; i < (w - w % 16); i += 16) {
+            u8x16 sum_pixel;
+            LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+            vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(p_stride) + i]);
+        }
+        // handle the remaining (w % 16) pixels of the row
+        if (w & 0xf) {
+            int i = w - w % 16;
+            ALIGN_STK_16(uint8_t, tmp_out, 16,);
+            u8x16 sum_pixel;
+
+            LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+            vec_vsx_st(sum_pixel, 0, tmp_out);
+
+            for (int k = 0; i < w; i++, k++) {
+                p[j * PXSTRIDE(p_stride) + i] = tmp_out[k];
+            }
+        }
+    }
+}
+
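+// Assemble one restoration unit into dst: (stripe_h + 6) rows of
+// REST_UNIT_STRIDE entries with a 3-pixel border on every side, sourced
+// from the frame (p), the saved loop-filtered rows (lpf) and the left
+// column (left), replicating edge pixels where neighbours are missing.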
+static inline void padding(uint8_t *dst, const uint8_t *p,
+                           const ptrdiff_t p_stride, const uint8_t (*left)[4],
+                           const uint8_t *lpf, const ptrdiff_t lpf_stride,
+                           int unit_w, const int stripe_h,
+                           const enum LrEdgeFlags edges)
+{
+    const int have_left = !!(edges & LR_HAVE_LEFT);
+    const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+    // Copy more pixels if we don't have to pad them
+    unit_w += 3 * have_left + 3 * have_right;
+    uint8_t *dst_l = dst + 3 * !have_left;
+    p -= 3 * have_left;
+    lpf -= 3 * have_left;
+
+    if (edges & LR_HAVE_TOP) {
+        // Copy previous loop filtered rows
+        const uint8_t *const above_1 = lpf;
+        const uint8_t *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_l, above_1, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+    } else {
+        // Pad with first row
+        pixel_copy(dst_l, p, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) {
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
+    }
+
+    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+    if (edges & LR_HAVE_BOTTOM) {
+        // Copy next loop filtered rows
+        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
+        const uint8_t *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+    } else {
+        // Pad with last row
+        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) {
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
+    }
+
+    // Inner UNIT_WxSTRIPE_H
+    for (int j = 0; j < stripe_h; j++) {
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+        dst_tl += REST_UNIT_STRIDE;
+        p += PXSTRIDE(p_stride);
+    }
+
+    if (!have_right) {
+        uint8_t *pad = dst_l + unit_w;
+        uint8_t *row_last = &dst_l[unit_w - 1];
+        // Pad 3x(STRIPE_H+6) with last column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(pad, *row_last, 3);
+            pad += REST_UNIT_STRIDE;
+            row_last += REST_UNIT_STRIDE;
+        }
+    }
+
+    if (!have_left) {
+        // Pad 3x(STRIPE_H+6) with first column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(dst, *dst_l, 3);
+            dst += REST_UNIT_STRIDE;
+            dst_l += REST_UNIT_STRIDE;
+        }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE;
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3);
+            dst += REST_UNIT_STRIDE;
+        }
+    }
+}
+
+// FIXME Could split into luma- and chroma-specific functions
+// (since the first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
+                              const uint8_t (*const left)[4],
+                              const uint8_t *lpf,
+                              const ptrdiff_t lpf_stride,
+                              const int w, const int h,
+                              const int16_t filterh[7],
+                              const int16_t filterv[7],
+                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    // Wiener filtering is applied to a maximum stripe height of 64 rows,
+    // plus 3 rows of padding above and below
+    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+    wiener_filter_h_vsx(hor, tmp, filterh, w, h);
+    wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
+}
+#endif
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)
+    (Dav1dLoopRestorationDSPContext *const c)
+{
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+    c->wiener = wiener_filter_vsx;
+#endif
+}
diff --git a/src/ppc/types.h b/src/ppc/types.h
new file mode 100644 (file)
index 0000000..0b4bd72
--- /dev/null
+++ b/src/ppc/types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_TYPES_H
+#define DAV1D_SRC_PPC_TYPES_H
+
+#include <altivec.h>
+#undef pixel
+
+#define u8x16 vector unsigned char
+#define i8x16 vector signed char
+#define b8x16 vector bool char
+#define u16x8 vector unsigned short
+#define i16x8 vector signed short
+#define b16x8 vector bool short
+#define u32x4 vector unsigned int
+#define i32x4 vector signed int
+#define b32x4 vector bool int
+#define u64x2 vector unsigned long long
+#define i64x2 vector signed long long
+#define b64x2 vector bool long long
+
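+// Widening helpers: u8h_/u8l_ zero-extend the high/low eight bytes of a
+// vector to eight 16-bit lanes; u16h_/u16l_ zero-extend and i16h_/i16l_
+// sign-extend eight 16-bit lanes to four 32-bit lanes.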
+#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
+#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
+#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
+#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v))
+#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
+#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v))
+
+#endif /* DAV1D_SRC_PPC_TYPES_H */
diff --git a/src/qm.c b/src/qm.c
new file mode 100644 (file)
index 0000000..8d9a0f9
--- /dev/null
+++ b/src/qm.c
@@ -0,0 +1,3152 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/qm.h"
+
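+// Quantizer-matrix tables, one entry per QM level, with [0] holding the
+// luma and [1] the chroma matrix. Tables with a _t suffix store only the
+// lower triangle (10 = 4*5/2 entries for 4x4, 36 = 8*9/2 for 8x8), since
+// those matrices are symmetric, and are expanded to full matrices at init
+// time; non-square tables (e.g. 8x4) are stored row by row.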
+static const uint8_t qm_tbl_4x4_t[][2][10] = {
+    {
+        {
+             32,
+             43,  67,
+             73,  94, 137,
+             97, 110, 150, 200,
+        }, {
+             35,
+             46,  60,
+             57,  69,  90,
+             66,  71,  90, 109,
+        },
+    }, {
+        {
+             32,
+             41,  63,
+             69,  88, 127,
+             92, 103, 140, 184,
+        }, {
+             33,
+             45,  58,
+             56,  66,  86,
+             64,  69,  87, 105,
+        },
+    }, {
+        {
+             32,
+             38,  56,
+             63,  78, 113,
+             86,  97, 130, 169,
+        }, {
+             32,
+             45,  55,
+             53,  62,  80,
+             63,  67,  84, 101,
+        },
+    }, {
+        {
+             32,
+             37,  54,
+             58,  72, 102,
+             81,  91, 121, 156,
+        }, {
+             32,
+             45,  54,
+             51,  59,  75,
+             61,  65,  81,  97,
+        },
+    }, {
+        {
+             32,
+             34,  49,
+             53,  64,  91,
+             75,  81, 112, 140,
+        }, {
+             32,
+             46,  53,
+             49,  55,  70,
+             58,  62,  78,  91,
+        },
+    }, {
+        {
+             32,
+             34,  48,
+             49,  60,  82,
+             72,  79, 104, 134,
+        }, {
+             32,
+             46,  53,
+             47,  54,  66,
+             57,  60,  75,  89,
+        },
+    }, {
+        {
+             32,
+             33,  39,
+             45,  51,  71,
+             62,  64,  87, 108,
+        }, {
+             31,
+             42,  48,
+             47,  50,  61,
+             53,  54,  67,  78,
+        },
+    }, {
+        {
+             32,
+             33,  38,
+             42,  46,  63,
+             55,  57,  75,  92,
+        }, {
+             31,
+             41,  48,
+             46,  48,  58,
+             51,  51,  62,  71,
+        },
+    }, {
+        {
+             32,
+             32,  35,
+             38,  40,  54,
+             51,  49,  64,  81,
+        }, {
+             31,
+             38,  47,
+             47,  46,  54,
+             49,  46,  57,  66,
+        },
+    }, {
+        {
+             32,
+             32,  34,
+             35,  37,  48,
+             43,  43,  54,  65,
+        }, {
+             31,
+             37,  44,
+             47,  47,  53,
+             47,  45,  53,  59,
+        },
+    }, {
+        {
+             32,
+             32,  33,
+             34,  35,  39,
+             38,  39,  45,  54,
+        }, {
+             31,
+             34,  39,
+             42,  45,  48,
+             47,  46,  49,  54,
+        },
+    }, {
+        {
+             32,
+             32,  32,
+             32,  33,  35,
+             35,  35,  38,  46,
+        }, {
+             31,
+             32,  34,
+             38,  41,  47,
+             46,  46,  47,  52,
+        },
+    }, {
+        {
+             31,
+             32,  32,
+             32,  32,  33,
+             32,  33,  34,  35,
+        }, {
+             31,
+             31,  32,
+             34,  35,  39,
+             38,  40,  43,  47,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             31,  32,  32,
+             32,  32,  32,  33,
+        }, {
+             31,
+             31,  31,
+             31,  31,  32,
+             34,  35,  35,  39,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+        }, {
+             31,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+        },
+    },
+};
+
+static const uint8_t qm_tbl_8x4[][2][32] = {
+    {
+        {
+             32,  33,  37,  49,  65,  80,  91, 104,
+             42,  42,  58,  71,  84,  97, 100, 112,
+             75,  69,  84, 103, 125, 142, 145, 146,
+             91,  86,  91, 110, 128, 152, 178, 190,
+        }, {
+             31,  40,  46,  48,  54,  61,  64,  68,
+             47,  45,  56,  61,  65,  69,  68,  71,
+             60,  54,  64,  75,  85,  92,  90,  87,
+             66,  61,  64,  73,  82,  92, 102, 105,
+        },
+    }, {
+        {
+             32,  33,  36,  46,  60,  75,  86,  98,
+             42,  42,  56,  67,  79,  92,  95, 105,
+             69,  64,  77,  93, 112, 130, 136, 136,
+             88,  83,  88, 105, 122, 144, 167, 177,
+        }, {
+             31,  40,  46,  47,  52,  59,  63,  66,
+             47,  45,  55,  60,  64,  68,  66,  69,
+             57,  52,  61,  70,  79,  87,  88,  85,
+             65,  61,  63,  72,  81,  90,  99, 102,
+        },
+    }, {
+        {
+             32,  32,  34,  44,  54,  72,  82,  92,
+             38,  40,  51,  61,  69,  84,  89,  98,
+             62,  58,  68,  85,  98, 118, 129, 127,
+             86,  80,  85, 101, 117, 136, 157, 165,
+        }, {
+             31,  38,  46,  46,  50,  57,  61,  65,
+             47,  46,  53,  56,  59,  64,  65,  67,
+             54,  50,  57,  66,  74,  82,  85,  82,
+             64,  60,  62,  71,  79,  88,  97,  99,
+        },
+    }, {
+        {
+             32,  32,  34,  41,  51,  65,  75,  86,
+             35,  36,  47,  53,  61,  73,  81,  92,
+             59,  57,  65,  78,  92, 108, 117, 119,
+             83,  78,  82,  97, 111, 129, 148, 154,
+        }, {
+             31,  36,  46,  45,  49,  54,  59,  63,
+             47,  47,  52,  53,  55,  58,  61,  65,
+             53,  50,  55,  63,  71,  77,  81,  80,
+             63,  59,  61,  70,  77,  86,  94,  95,
+        },
+    }, {
+        {
+             32,  32,  34,  38,  48,  60,  72,  81,
+             35,  36,  42,  51,  59,  68,  79,  86,
+             51,  50,  54,  67,  80,  92, 104, 112,
+             77,  72,  75,  87, 103, 119, 135, 144,
+        }, {
+             31,  36,  43,  45,  47,  52,  57,  61,
+             47,  47,  50,  53,  54,  56,  60,  63,
+             50,  47,  50,  58,  66,  70,  75,  77,
+             61,  57,  58,  65,  74,  82,  90,  93,
+        },
+    }, {
+        {
+             32,  32,  34,  37,  45,  54,  65,  75,
+             35,  36,  42,  50,  56,  63,  73,  81,
+             51,  50,  54,  65,  76,  87,  97, 106,
+             75,  71,  73,  84,  96, 110, 125, 136,
+        }, {
+             31,  36,  43,  46,  46,  50,  54,  59,
+             47,  47,  50,  53,  54,  55,  58,  61,
+             50,  47,  50,  57,  64,  68,  72,  75,
+             60,  56,  57,  64,  71,  78,  85,  90,
+        },
+    }, {
+        {
+             32,  32,  33,  35,  41,  49,  57,  66,
+             34,  34,  37,  43,  48,  54,  60,  68,
+             43,  42,  44,  54,  64,  71,  78,  86,
+             62,  59,  58,  68,  79,  91, 101, 111,
+        }, {
+             31,  33,  40,  47,  45,  48,  51,  55,
+             42,  44,  47,  50,  49,  50,  52,  55,
+             47,  45,  46,  54,  59,  61,  63,  66,
+             54,  51,  50,  57,  64,  70,  75,  79,
+        },
+    }, {
+        {
+             32,  32,  32,  34,  38,  44,  50,  61,
+             32,  33,  35,  37,  40,  45,  50,  58,
+             42,  41,  42,  50,  58,  66,  71,  79,
+             56,  53,  52,  59,  68,  78,  86,  97,
+        }, {
+             31,  32,  39,  44,  46,  47,  48,  53,
+             38,  40,  47,  47,  47,  46,  47,  50,
+             47,  45,  45,  51,  56,  59,  61,  64,
+             52,  49,  48,  53,  58,  64,  68,  73,
+        },
+    }, {
+        {
+             32,  32,  32,  34,  35,  40,  46,  52,
+             32,  33,  34,  37,  38,  42,  46,  51,
+             37,  36,  38,  44,  49,  55,  59,  64,
+             52,  49,  49,  54,  60,  69,  76,  83,
+        }, {
+             31,  31,  36,  42,  47,  46,  48,  50,
+             38,  40,  44,  47,  48,  46,  46,  48,
+             47,  46,  47,  50,  53,  54,  55,  56,
+             50,  48,  47,  50,  54,  60,  64,  67,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  34,  37,  42,  46,
+             32,  33,  34,  35,  37,  40,  43,  46,
+             35,  34,  36,  38,  43,  49,  53,  56,
+             43,  41,  42,  42,  49,  56,  63,  67,
+        }, {
+             31,  31,  35,  39,  43,  47,  46,  48,
+             38,  40,  43,  47,  47,  47,  46,  46,
+             47,  46,  47,  47,  50,  53,  53,  54,
+             48,  45,  46,  45,  50,  55,  58,  59,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  33,  34,  37,  40,
+             32,  32,  33,  33,  34,  36,  38,  40,
+             34,  34,  34,  36,  38,  41,  44,  46,
+             39,  38,  38,  40,  42,  47,  52,  56,
+        }, {
+             31,  31,  33,  36,  40,  45,  47,  47,
+             34,  35,  37,  41,  44,  46,  47,  46,
+             42,  42,  44,  46,  48,  49,  50,  49,
+             48,  46,  46,  46,  48,  51,  54,  55,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  33,  34,  35,
+             31,  32,  32,  32,  33,  33,  34,  34,
+             32,  32,  33,  34,  35,  36,  37,  38,
+             35,  35,  34,  36,  38,  40,  42,  48,
+        }, {
+             31,  31,  31,  34,  37,  39,  42,  48,
+             31,  31,  32,  36,  39,  41,  43,  46,
+             37,  38,  40,  43,  46,  47,  47,  48,
+             48,  47,  46,  47,  47,  48,  50,  53,
+        },
+    }, {
+        {
+             31,  31,  32,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  33,  34,  34,  35,
+             32,  32,  32,  33,  34,  34,  35,  36,
+        }, {
+             31,  31,  31,  31,  34,  35,  38,  41,
+             31,  31,  32,  32,  36,  37,  40,  42,
+             35,  36,  37,  37,  40,  42,  45,  45,
+             37,  38,  39,  40,  43,  44,  47,  47,
+        },
+    }, {
+        {
+             31,  31,  31,  31,  31,  31,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,
+             32,  32,  32,  32,  32,  33,  33,  33,
+        }, {
+             31,  31,  31,  31,  31,  31,  34,  34,
+             31,  31,  31,  32,  32,  33,  36,  36,
+             31,  31,  31,  32,  32,  33,  36,  36,
+             34,  35,  35,  36,  36,  37,  40,  40,
+        },
+    }, {
+        {
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,
+        }, {
+             31,  31,  31,  31,  31,  31,  31,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,
+        },
+    },
+};
+
+static const uint8_t qm_tbl_8x8_t[][2][36] = {
+    {
+        {
+             32,
+             32,  35,
+             38,  40,  54,
+             51,  49,  65,  82,
+             68,  63,  78,  97, 117,
+             84,  76,  91, 111, 134, 152,
+             95,  89,  98, 113, 138, 159, 183,
+            109, 102, 106, 121, 142, 168, 199, 220,
+        }, {
+             31,
+             38,  47,
+             47,  46,  54,
+             50,  47,  57,  66,
+             57,  52,  61,  72,  82,
+             63,  57,  66,  77,  88,  96,
+             67,  62,  67,  75,  86,  95, 104,
+             71,  67,  68,  75,  84,  95, 107, 113,
+        },
+    }, {
+        {
+             32,
+             32,  35,
+             37,  39,  51,
+             47,  46,  60,  73,
+             62,  58,  71,  87, 105,
+             78,  72,  84, 100, 121, 140,
+             90,  84,  93, 106, 129, 148, 169,
+            102,  96, 100, 113, 132, 155, 183, 201,
+        }, {
+             31,
+             38,  47,
+             47,  47,  53,
+             48,  46,  55,  62,
+             54,  50,  58,  67,  76,
+             61,  55,  63,  72,  83,  91,
+             66,  61,  65,  73,  84,  92, 101,
+             69,  65,  66,  73,  82,  92, 103, 109,
+        },
+    }, {
+        {
+             32,
+             32,  34,
+             35,  37,  48,
+             46,  45,  56,  70,
+             57,  54,  64,  80,  93,
+             76,  70,  79,  96, 111, 134,
+             85,  79,  87, 100, 121, 138, 156,
+             96,  90,  93, 105, 122, 144, 168, 184,
+        }, {
+             31,
+             36,  43,
+             47,  47,  53,
+             48,  46,  54,  61,
+             52,  49,  55,  65,  71,
+             60,  55,  60,  70,  78,  89,
+             64,  59,  63,  71,  81,  89,  97,
+             67,  63,  64,  71,  79,  89,  99, 104,
+        },
+    }, {
+        {
+             32,
+             32,  33,
+             35,  36,  46,
+             42,  42,  52,  63,
+             53,  51,  60,  73,  86,
+             68,  64,  72,  84, 100, 117,
+             78,  74,  80,  92, 109, 128, 140,
+             90,  84,  87,  98, 114, 133, 155, 168,
+        }, {
+             31,
+             34,  39,
+             46,  47,  52,
+             47,  45,  52,  58,
+             50,  48,  54,  62,  68,
+             57,  53,  58,  65,  73,  82,
+             61,  57,  61,  68,  77,  86,  91,
+             65,  61,  62,  68,  76,  86,  95, 100,
+        },
+    }, {
+        {
+             32,
+             32,  33,
+             34,  35,  39,
+             39,  40,  46,  56,
+             50,  48,  53,  65,  78,
+             62,  59,  63,  75,  90, 105,
+             76,  71,  74,  86, 101, 118, 134,
+             84,  79,  81,  92, 106, 123, 142, 153,
+        }, {
+             31,
+             34,  39,
+             42,  45,  48,
+             47,  46,  49,  55,
+             49,  47,  50,  58,  65,
+             54,  51,  53,  61,  69,  76,
+             60,  56,  57,  65,  73,  82,  89,
+             64,  59,  60,  66,  74,  83,  92,  96,
+        },
+    }, {
+        {
+             32,
+             32,  33,
+             34,  35,  39,
+             38,  39,  45,  54,
+             46,  45,  51,  61,  71,
+             56,  54,  58,  69,  80,  92,
+             68,  64,  68,  78,  90, 103, 117,
+             78,  74,  76,  86,  99, 113, 128, 140,
+        }, {
+             31,
+             34,  39,
+             42,  45,  48,
+             47,  46,  49,  54,
+             48,  46,  50,  56,  61,
+             52,  49,  52,  58,  65,  71,
+             57,  53,  55,  61,  68,  75,  82,
+             61,  57,  58,  64,  71,  79,  86,  91,
+        },
+    }, {
+        {
+             31,
+             32,  32,
+             32,  33,  35,
+             35,  35,  38,  48,
+             42,  41,  43,  54,  63,
+             51,  49,  49,  59,  71,  81,
+             59,  56,  56,  66,  77,  89,  98,
+             69,  65,  64,  73,  85,  97, 108, 119,
+        }, {
+             31,
+             32,  35,
+             38,  42,  47,
+             48,  47,  48,  53,
+             47,  45,  45,  53,  58,
+             50,  47,  47,  54,  61,  66,
+             53,  50,  49,  56,  63,  69,  73,
+             57,  54,  52,  58,  65,  72,  77,  82,
+        },
+    }, {
+        {
+             31,
+             32,  32,
+             32,  32,  35,
+             34,  34,  37,  42,
+             38,  37,  40,  47,  54,
+             46,  44,  45,  52,  60,  69,
+             52,  49,  49,  56,  65,  75,  82,
+             63,  59,  58,  65,  73,  84,  92, 105,
+        }, {
+             31,
+             31,  32,
+             38,  40,  47,
+             44,  44,  47,  50,
+             47,  45,  46,  51,  54,
+             48,  46,  46,  51,  56,  61,
+             50,  47,  47,  52,  57,  63,  66,
+             55,  52,  50,  54,  60,  66,  70,  76,
+        },
+    }, {
+        {
+             31,
+             32,  32,
+             32,  32,  34,
+             34,  33,  35,  39,
+             35,  34,  37,  42,  48,
+             41,  40,  41,  47,  53,  60,
+             47,  44,  45,  51,  57,  65,  71,
+             53,  50,  51,  55,  61,  70,  77,  85,
+        }, {
+             31,
+             31,  32,
+             35,  36,  41,
+             42,  42,  45,  48,
+             48,  46,  47,  50,  53,
+             47,  45,  45,  49,  53,  57,
+             49,  46,  46,  50,  54,  59,  61,
+             51,  48,  48,  51,  54,  60,  64,  68,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             32,  32,  33,
+             32,  32,  34,  35,
+             34,  34,  35,  37,  41,
+             37,  36,  38,  39,  45,  51,
+             43,  41,  42,  42,  49,  56,  63,
+             47,  44,  45,  46,  52,  59,  67,  71,
+        }, {
+             31,
+             31,  32,
+             34,  35,  39,
+             37,  40,  43,  47,
+             43,  43,  45,  47,  49,
+             48,  46,  46,  47,  50,  53,
+             47,  45,  45,  45,  50,  55,  58,
+             49,  46,  46,  46,  50,  55,  60,  61,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             32,  32,  32,
+             32,  32,  33,  34,
+             33,  33,  34,  35,  37,
+             34,  34,  35,  36,  39,  43,
+             37,  36,  37,  38,  41,  46,  51,
+             41,  39,  40,  41,  44,  49,  54,  58,
+        }, {
+             31,
+             31,  31,
+             32,  33,  35,
+             35,  37,  39,  43,
+             39,  41,  42,  45,  47,
+             45,  44,  45,  47,  48,  50,
+             48,  46,  46,  47,  48,  51,  53,
+             48,  46,  45,  46,  47,  51,  54,  56,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             31,  32,  32,
+             32,  32,  32,  33,
+             32,  32,  32,  34,  35,
+             32,  33,  33,  34,  35,  36,
+             34,  34,  33,  35,  36,  38,  39,
+             35,  35,  34,  36,  38,  40,  42,  48,
+        }, {
+             31,
+             31,  31,
+             30,  31,  32,
+             34,  34,  35,  39,
+             36,  37,  39,  42,  46,
+             39,  40,  41,  44,  47,  47,
+             42,  42,  42,  45,  47,  48,  48,
+             48,  47,  46,  47,  47,  49,  50,  53,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             32,  32,  32,  32,  33,
+             32,  32,  32,  32,  33,  34,
+             32,  32,  32,  32,  34,  34,  35,
+             33,  33,  33,  33,  35,  35,  36,  38,
+        }, {
+             31,
+             31,  31,
+             31,  31,  31,
+             30,  31,  31,  32,
+             34,  34,  35,  35,  39,
+             35,  35,  36,  36,  40,  41,
+             37,  38,  39,  40,  43,  44,  47,
+             40,  41,  41,  42,  44,  45,  47,  48,
+        },
+    }, {
+        {
+             31,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,
+             32,  32,  32,  32,  32,  32,  33,
+             32,  32,  32,  32,  32,  32,  33,  33,
+        }, {
+             31,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             30,  31,  31,  31,  32,
+             31,  32,  32,  32,  32,  33,
+             33,  34,  34,  35,  35,  36,  39,
+             33,  34,  34,  35,  35,  36,  39,  39,
+        },
+    }, {
+        {
+             31,
+             31,  31,
+             31,  31,  31,
+             31,  31,  32,  32,
+             31,  31,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,
+        }, {
+             31,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  31,  31,  31,
+        },
+    },
+};
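+/* Each entry in the triangular tables above packs two symmetric 8x8
+ * matrices (apparently luma then chroma) as rows of 1..8 values, i.e. only
+ * the 36 lower-triangle coefficients are stored. A minimal illustrative
+ * expansion to a full row-major 8x8 grid -- the helper name is hypothetical
+ * and not part of this import: */
+static void expand_tri_8x8(uint8_t *const dst, const uint8_t *src) {
+    for (int y = 0; y < 8; y++)     /* copy the stored lower triangle */
+        for (int x = 0; x <= y; x++)
+            dst[y * 8 + x] = *src++;
+    for (int y = 0; y < 8; y++)     /* mirror it into the upper triangle */
+        for (int x = y + 1; x < 8; x++)
+            dst[y * 8 + x] = dst[x * 8 + y];
+}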
+
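+/* The rectangular tables below store each matrix in full, row-major order:
+ * qm_tbl_16x4 holds 4 rows of 16 coefficients (64 bytes) for each of the
+ * two planes. A minimal illustrative accessor (hypothetical name, bounds
+ * assumed as 0 <= y < 4 and 0 <= x < 16): */
+static inline uint8_t qm_weight_16x4(const uint8_t m[64],
+                                     const int y, const int x)
+{
+    return m[y * 16 + x];
+}
+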
+static const uint8_t qm_tbl_16x4[][2][64] = {
+    {
+        {
+             31,  32,  32,  34,  34,  41,  45,  54,  60,  72,  75,  83,  88,  94, 101, 108,
+             44,  41,  42,  48,  54,  63,  67,  75,  79,  90,  92, 100, 100, 101, 108, 115,
+             79,  72,  71,  73,  78,  90,  96, 110, 118, 133, 136, 142, 140, 144, 141, 151,
+             96,  90,  86,  83,  89,  95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
+        }, {
+             31,  32,  36,  43,  46,  45,  46,  50,  52,  57,  59,  62,  63,  65,  67,  69,
+             49,  45,  46,  49,  53,  58,  59,  62,  64,  67,  68,  71,  69,  68,  70,  72,
+             63,  57,  56,  57,  60,  67,  71,  78,  82,  89,  90,  91,  89,  89,  86,  88,
+             69,  65,  62,  60,  63,  66,  70,  74,  80,  85,  91,  96, 101, 103, 105, 107,
+        },
+    }, {
+        {
+             31,  32,  32,  33,  34,  37,  44,  49,  56,  65,  72,  78,  84,  89,  95, 101,
+             44,  41,  42,  44,  54,  58,  66,  71,  77,  84,  90,  95,  95,  95, 101, 108,
+             73,  67,  65,  66,  74,  79,  90,  99, 107, 119, 127, 133, 132, 136, 132, 141,
+             93,  87,  83,  81,  86,  92,  98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
+        }, {
+             31,  32,  36,  41,  46,  46,  46,  48,  51,  54,  57,  60,  62,  64,  65,  67,
+             49,  45,  46,  47,  53,  56,  59,  61,  63,  65,  67,  69,  67,  66,  68,  70,
+             61,  55,  54,  54,  59,  62,  68,  73,  77,  82,  86,  88,  86,  87,  83,  86,
+             69,  64,  61,  59,  62,  65,  68,  73,  78,  84,  89,  93,  98, 100, 102, 103,
+        },
+    }, {
+        {
+             31,  32,  32,  33,  34,  37,  41,  46,  53,  60,  65,  74,  79,  84,  89,  94,
+             39,  38,  39,  40,  47,  54,  58,  62,  68,  73,  78,  85,  90,  90,  95, 101,
+             65,  60,  59,  58,  65,  73,  79,  86,  97, 105, 111, 120, 125, 128, 124, 131,
+             90,  84,  81,  78,  83,  89,  94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
+        }, {
+             31,  32,  36,  40,  44,  46,  45,  47,  49,  52,  54,  58,  60,  62,  64,  65,
+             48,  46,  46,  46,  51,  54,  56,  57,  58,  60,  62,  64,  66,  64,  66,  68,
+             57,  53,  51,  50,  54,  60,  64,  68,  73,  76,  79,  82,  84,  84,  81,  83,
+             68,  63,  60,  58,  61,  64,  67,  71,  77,  82,  87,  91,  95,  97,  99, 100,
+        },
+    }, {
+        {
+             31,  32,  32,  33,  34,  34,  39,  44,  49,  54,  60,  68,  75,  79,  84,  88,
+             36,  35,  36,  38,  42,  48,  51,  56,  60,  63,  68,  75,  81,  85,  89,  94,
+             62,  58,  57,  56,  61,  66,  74,  82,  90,  95, 102, 110, 117, 120, 116, 123,
+             88,  82,  79,  76,  81,  85,  91,  98, 107, 117, 127, 135, 145, 148, 153, 159,
+        }, {
+             31,  32,  35,  40,  43,  46,  45,  46,  48,  50,  52,  56,  58,  60,  62,  63,
+             48,  46,  47,  47,  50,  53,  53,  54,  54,  55,  56,  59,  61,  63,  64,  66,
+             56,  52,  50,  49,  53,  56,  61,  65,  70,  72,  75,  79,  81,  82,  79,  81,
+             67,  62,  60,  57,  60,  63,  66,  70,  75,  80,  85,  89,  93,  94,  96,  97,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  33,  34,  37,  41,  45,  49,  54,  61,  68,  74,  78,  83,
+             36,  35,  35,  37,  41,  48,  50,  53,  56,  60,  63,  69,  75,  80,  84,  88,
+             53,  51,  49,  49,  53,  60,  65,  71,  76,  82,  87,  93, 100, 105, 109, 114,
+             81,  76,  73,  71,  74,  80,  85,  91,  98, 105, 112, 121, 130, 137, 142, 148,
+        }, {
+             31,  31,  33,  38,  42,  46,  46,  45,  46,  48,  50,  52,  56,  58,  60,  62,
+             48,  47,  46,  47,  49,  53,  53,  53,  54,  54,  55,  57,  59,  61,  62,  64,
+             52,  49,  48,  47,  50,  54,  57,  61,  64,  66,  68,  71,  73,  75,  76,  78,
+             64,  60,  57,  56,  57,  61,  64,  68,  71,  75,  78,  83,  87,  90,  92,  94,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  33,  34,  34,  37,  41,  45,  49,  54,  60,  65,  72,  75,
+             36,  35,  34,  36,  38,  42,  48,  50,  53,  56,  60,  63,  68,  73,  79,  81,
+             53,  51,  49,  50,  49,  54,  60,  65,  71,  76,  82,  87,  92,  97, 104, 106,
+             79,  75,  72,  71,  69,  73,  78,  84,  90,  96, 103, 110, 118, 125, 133, 136,
+        }, {
+             31,  31,  32,  36,  40,  43,  46,  46,  45,  46,  48,  50,  52,  54,  57,  59,
+             48,  47,  46,  47,  47,  50,  53,  53,  53,  54,  54,  55,  56,  58,  60,  61,
+             52,  50,  48,  47,  47,  50,  54,  57,  61,  64,  66,  68,  70,  72,  75,  75,
+             63,  60,  57,  56,  54,  57,  60,  64,  67,  71,  75,  78,  82,  85,  89,  90,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  34,  34,  36,  39,  42,  45,  50,  54,  60,  66,  73,
+             34,  34,  33,  35,  37,  39,  42,  44,  46,  48,  51,  54,  58,  63,  68,  74,
+             44,  43,  41,  43,  43,  48,  53,  57,  60,  64,  67,  72,  76,  80,  85,  91,
+             65,  62,  59,  59,  58,  63,  67,  71,  76,  81,  85,  92,  98, 105, 111, 118,
+        }, {
+             31,  31,  32,  35,  40,  43,  46,  46,  46,  46,  47,  48,  50,  52,  55,  58,
+             42,  42,  42,  45,  47,  48,  50,  50,  49,  49,  50,  50,  52,  53,  55,  58,
+             49,  47,  45,  46,  46,  49,  53,  55,  57,  59,  60,  61,  63,  64,  66,  68,
+             57,  54,  52,  51,  50,  53,  56,  58,  61,  64,  67,  71,  73,  76,  79,  82,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  32,  34,  35,  37,  39,  41,  45,  50,  54,  57,  61,
+             32,  32,  33,  34,  34,  35,  37,  38,  40,  41,  43,  46,  50,  53,  56,  58,
+             44,  42,  41,  42,  42,  42,  48,  54,  57,  60,  63,  67,  71,  74,  77,  79,
+             58,  55,  53,  53,  53,  52,  57,  63,  67,  70,  74,  79,  86,  90,  93,  97,
+        }, {
+             31,  31,  32,  34,  37,  39,  42,  47,  46,  46,  46,  47,  48,  50,  51,  53,
+             37,  38,  40,  42,  44,  47,  47,  48,  47,  46,  46,  46,  47,  48,  49,  50,
+             49,  47,  45,  45,  45,  45,  49,  53,  55,  57,  58,  59,  61,  62,  63,  64,
+             54,  51,  49,  49,  48,  48,  51,  55,  58,  60,  62,  65,  68,  70,  71,  73,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  32,  33,  34,  35,  37,  38,  42,  45,  47,  51,  55,
+             32,  32,  32,  33,  34,  35,  36,  37,  38,  40,  40,  43,  45,  47,  50,  54,
+             38,  37,  36,  36,  38,  39,  41,  44,  49,  51,  52,  56,  58,  60,  63,  67,
+             53,  51,  49,  49,  50,  49,  51,  54,  60,  63,  65,  71,  75,  77,  82,  87,
+        }, {
+             31,  31,  31,  32,  35,  39,  40,  42,  47,  47,  46,  46,  47,  48,  49,  51,
+             37,  38,  39,  40,  43,  47,  47,  47,  48,  47,  47,  46,  46,  47,  47,  49,
+             48,  47,  46,  46,  46,  47,  48,  50,  53,  53,  54,  55,  55,  55,  56,  57,
+             52,  50,  48,  48,  47,  47,  48,  50,  54,  56,  57,  61,  63,  64,  66,  68,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  35,  38,  40,  42,  45,  46,
+             32,  32,  32,  33,  34,  34,  35,  36,  37,  38,  38,  40,  41,  43,  45,  46,
+             36,  35,  35,  34,  36,  36,  38,  40,  42,  47,  48,  50,  51,  53,  56,  56,
+             44,  42,  41,  41,  42,  42,  42,  44,  48,  52,  54,  58,  60,  63,  66,  67,
+        }, {
+             31,  31,  31,  31,  34,  35,  39,  40,  42,  46,  47,  47,  47,  46,  48,  48,
+             37,  38,  39,  40,  42,  43,  47,  47,  47,  48,  48,  47,  46,  46,  46,  46,
+             48,  47,  46,  46,  47,  47,  47,  48,  50,  52,  53,  53,  53,  53,  54,  54,
+             49,  47,  46,  45,  45,  46,  45,  47,  49,  53,  53,  56,  57,  58,  59,  59,
+        },
+    }, {
+        {
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  34,  34,  35,  35,  38,  38,  42,
+             32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  36,  36,  39,  39,  42,
+             34,  34,  34,  33,  33,  35,  35,  37,  37,  39,  39,  43,  43,  45,  45,  48,
+             39,  38,  38,  37,  37,  39,  39,  40,  40,  45,  45,  51,  51,  54,  54,  58,
+        }, {
+             31,  31,  31,  31,  31,  34,  34,  38,  38,  42,  42,  48,  48,  47,  47,  47,
+             33,  34,  34,  35,  35,  39,  39,  43,  43,  45,  45,  47,  47,  46,  46,  45,
+             42,  42,  42,  42,  42,  45,  45,  47,  47,  48,  48,  50,  50,  50,  50,  49,
+             48,  47,  47,  45,  45,  46,  46,  46,  46,  50,  50,  53,  53,  54,  54,  56,
+        },
+    }, {
+        {
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,
+             36,  35,  35,  35,  34,  34,  36,  36,  37,  38,  38,  41,  42,  44,  48,  48,
+        }, {
+             31,  31,  31,  31,  31,  31,  34,  34,  36,  38,  38,  41,  42,  44,  48,  48,
+             31,  31,  31,  32,  32,  32,  35,  36,  37,  40,  40,  42,  43,  44,  46,  46,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,
+             48,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  49,  50,  51,  53,  53,
+        },
+    }, {
+        {
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  35,  36,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,
+        }, {
+             31,  31,  31,  31,  31,  31,  31,  31,  33,  34,  34,  36,  38,  38,  39,  42,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,
+             35,  35,  36,  36,  36,  37,  37,  38,  40,  40,  40,  43,  45,  45,  45,  46,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,
+        },
+    }, {
+        {
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,
+        }, {
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  34,  34,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,
+             34,  34,  35,  35,  35,  35,  36,  36,  36,  36,  36,  37,  39,  40,  40,  40,
+        },
+    }, {
+        {
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        }, {
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+        },
+    },
+};
+
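+/* qm_tbl_16x8 follows the same convention: 8 rows of 16 coefficients
+ * (128 bytes) per plane, with successive entries running from the steepest
+ * frequency weighting down to nearly flat matrices of 30..32. */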
+static const uint8_t qm_tbl_16x8[][2][128] = {
+    {
+        {
+             32,  31,  32,  34,  36,  44,  48,  58,  65,  79,  82,  91,  97, 103, 110, 118,
+             32,  33,  34,  37,  38,  43,  46,  54,  58,  70,  72,  80,  86,  93, 100, 107,
+             36,  34,  36,  42,  48,  53,  56,  63,  68,  79,  81,  88,  94,  98, 101, 105,
+             53,  49,  50,  54,  60,  71,  76,  87,  92, 104, 106, 106, 107, 114, 117, 118,
+             65,  59,  59,  63,  68,  79,  85,  98, 105, 118, 121, 130, 128, 131, 138, 136,
+             87,  78,  77,  79,  84,  95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157,
+             93,  86,  82,  80,  86,  94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182,
+             99,  93,  89,  88,  90,  97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
+        }, {
+             32,  30,  33,  42,  49,  49,  50,  54,  57,  63,  64,  68,  70,  72,  74,  76,
+             37,  40,  43,  47,  48,  46,  46,  49,  50,  55,  56,  59,  62,  64,  67,  69,
+             48,  46,  47,  50,  53,  53,  54,  55,  56,  60,  61,  64,  66,  66,  66,  67,
+             52,  48,  47,  50,  54,  61,  64,  68,  70,  75,  75,  74,  73,  75,  74,  73,
+             57,  52,  51,  53,  57,  64,  67,  73,  76,  82,  83,  86,  83,  83,  84,  82,
+             66,  60,  59,  60,  62,  69,  73,  80,  84,  92,  93,  94,  96,  92,  94,  91,
+             68,  63,  60,  59,  62,  66,  72,  76,  80,  87,  93,  98,  99, 101, 103, 101,
+             71,  66,  63,  62,  62,  66,  70,  75,  79,  84,  89,  94,  98, 104, 106, 109,
+        },
+    }, {
+        {
+             32,  31,  32,  32,  36,  39,  47,  53,  61,  71,  79,  86,  92,  98, 104, 110,
+             32,  32,  34,  35,  37,  40,  45,  50,  56,  64,  70,  76,  82,  88,  94, 100,
+             36,  35,  36,  40,  48,  50,  56,  60,  65,  73,  79,  84,  89,  93,  95,  98,
+             47,  44,  45,  47,  56,  60,  69,  75,  81,  89,  95, 100, 101, 108, 110, 111,
+             65,  60,  59,  60,  68,  73,  84,  92, 100, 111, 118, 124, 121, 124, 129, 127,
+             79,  72,  71,  71,  78,  84,  95, 103, 113, 125, 133, 140, 148, 141, 151, 147,
+             90,  84,  80,  78,  83,  91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169,
+             96,  90,  87,  85,  87,  94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
+        }, {
+             32,  30,  33,  39,  49,  48,  50,  52,  55,  60,  63,  66,  68,  70,  72,  74,
+             35,  38,  41,  46,  48,  46,  46,  47,  49,  53,  55,  58,  60,  62,  65,  67,
+             48,  46,  47,  48,  53,  53,  54,  54,  56,  58,  60,  62,  64,  65,  65,  65,
+             50,  46,  46,  47,  54,  56,  61,  63,  65,  68,  70,  72,  71,  73,  72,  71,
+             57,  52,  51,  51,  57,  60,  66,  71,  74,  79,  82,  84,  81,  81,  82,  79,
+             63,  58,  56,  55,  60,  64,  70,  75,  79,  85,  89,  91,  94,  89,  92,  89,
+             68,  63,  60,  58,  61,  65,  71,  75,  79,  85,  91,  95,  97,  98, 100,  98,
+             70,  65,  63,  61,  61,  65,  69,  74,  78,  82,  87,  91,  96, 101, 103, 105,
+        },
+    }, {
+        {
+             32,  31,  32,  32,  34,  39,  44,  49,  57,  65,  71,  81,  87,  92,  98, 103,
+             32,  32,  33,  34,  36,  39,  42,  46,  53,  59,  64,  72,  77,  83,  88,  94,
+             36,  35,  36,  38,  44,  50,  53,  57,  63,  68,  73,  80,  85,  88,  89,  92,
+             44,  41,  42,  42,  50,  58,  63,  67,  74,  79,  84,  91,  96, 102, 103, 103,
+             58,  54,  53,  52,  59,  68,  74,  81,  90,  97, 102, 110, 114, 117, 121, 119,
+             79,  73,  71,  69,  75,  84,  90,  97, 108, 118, 125, 135, 140, 133, 141, 137,
+             88,  81,  78,  76,  81,  88,  97, 104, 111, 123, 135, 145, 148, 153, 160, 158,
+             93,  88,  84,  82,  84,  90,  97, 105, 113, 122, 131, 141, 151, 163, 169, 175,
+        }, {
+             32,  31,  33,  37,  44,  48,  49,  51,  54,  57,  60,  64,  66,  68,  70,  72,
+             34,  36,  40,  44,  46,  46,  45,  47,  49,  51,  53,  57,  59,  61,  63,  65,
+             48,  46,  47,  47,  51,  53,  53,  54,  55,  56,  58,  61,  63,  63,  63,  63,
+             49,  46,  46,  45,  51,  56,  58,  60,  62,  64,  65,  68,  69,  71,  70,  69,
+             54,  50,  49,  48,  53,  58,  62,  65,  70,  73,  75,  78,  79,  79,  80,  77,
+             63,  58,  56,  54,  59,  64,  67,  71,  77,  82,  85,  89,  91,  87,  89,  86,
+             67,  62,  59,  57,  60,  64,  70,  73,  77,  83,  89,  93,  94,  96,  97,  95,
+             69,  65,  62,  60,  61,  64,  68,  72,  76,  81,  85,  89,  93,  98, 100, 102,
+        },
+    }, {
+        {
+             32,  31,  31,  32,  34,  36,  41,  47,  53,  58,  65,  74,  82,  87,  92,  97,
+             31,  32,  33,  34,  35,  36,  40,  44,  50,  54,  59,  67,  73,  78,  83,  88,
+             35,  34,  35,  37,  41,  46,  49,  53,  57,  61,  66,  73,  79,  83,  84,  86,
+             44,  41,  42,  42,  48,  54,  60,  66,  71,  75,  79,  86,  92,  96,  97,  97,
+             53,  50,  49,  49,  54,  60,  67,  75,  82,  87,  92, 100, 105, 110, 114, 111,
+             65,  61,  59,  58,  63,  68,  76,  84,  92,  98, 105, 113, 120, 125, 132, 128,
+             82,  76,  73,  71,  76,  80,  88,  97, 106, 112, 120, 131, 139, 144, 150, 147,
+             90,  85,  81,  79,  81,  87,  93, 101, 108, 116, 124, 134, 142, 153, 157, 163,
+        }, {
+             32,  31,  33,  37,  42,  49,  48,  50,  52,  54,  57,  61,  64,  66,  68,  70,
+             33,  34,  37,  43,  44,  47,  46,  46,  47,  49,  51,  55,  57,  59,  61,  63,
+             45,  45,  46,  47,  49,  52,  51,  52,  53,  54,  55,  58,  60,  61,  61,  61,
+             49,  46,  45,  45,  49,  53,  57,  59,  61,  62,  64,  66,  68,  69,  68,  67,
+             52,  49,  47,  47,  50,  54,  59,  63,  66,  68,  70,  73,  75,  77,  77,  75,
+             57,  53,  51,  50,  53,  57,  61,  66,  71,  73,  76,  80,  83,  84,  86,  83,
+             64,  60,  57,  55,  58,  61,  66,  71,  75,  79,  83,  87,  91,  93,  94,  92,
+             68,  64,  61,  59,  60,  63,  67,  71,  74,  79,  83,  87,  91,  95,  97,  98,
+        },
+    }, {
+        {
+             32,  31,  31,  32,  33,  36,  39,  44,  48,  53,  58,  66,  74,  81,  86,  91,
+             31,  32,  32,  33,  34,  35,  38,  41,  45,  49,  54,  60,  67,  73,  78,  82,
+             33,  33,  34,  36,  38,  42,  44,  46,  50,  53,  57,  63,  69,  75,  78,  80,
+             40,  39,  38,  40,  44,  51,  54,  59,  62,  66,  70,  75,  81,  86,  90,  90,
+             51,  49,  47,  48,  52,  58,  63,  69,  74,  79,  84,  90,  97, 102, 106, 103,
+             65,  61,  59,  58,  62,  68,  73,  79,  85,  92,  98, 106, 113, 120, 124, 119,
+             79,  74,  71,  69,  72,  78,  84,  90,  96, 103, 110, 119, 128, 135, 140, 137,
+             87,  82,  79,  77,  78,  84,  89,  96, 103, 111, 118, 126, 134, 143, 147, 151,
+        }, {
+             32,  31,  31,  35,  41,  49,  48,  49,  50,  52,  54,  57,  61,  64,  66,  68,
+             32,  33,  35,  39,  43,  47,  46,  45,  46,  48,  50,  52,  55,  58,  59,  61,
+             40,  41,  43,  46,  48,  50,  49,  48,  49,  50,  51,  53,  56,  58,  59,  59,
+             49,  47,  46,  46,  49,  53,  54,  56,  57,  58,  59,  61,  63,  65,  66,  65,
+             51,  49,  47,  47,  49,  54,  57,  61,  63,  65,  67,  69,  72,  73,  75,  72,
+             57,  54,  51,  50,  52,  57,  60,  64,  67,  71,  73,  77,  80,  82,  84,  81,
+             63,  59,  57,  55,  57,  60,  64,  67,  71,  75,  78,  82,  86,  89,  91,  89,
+             67,  63,  60,  58,  59,  62,  65,  69,  73,  77,  81,  85,  88,  92,  94,  95,
+        },
+    }, {
+        {
+             32,  31,  31,  32,  32,  34,  36,  39,  44,  48,  53,  58,  65,  71,  79,  82,
+             31,  32,  32,  32,  33,  34,  34,  37,  41,  45,  49,  54,  60,  65,  72,  75,
+             32,  32,  33,  34,  35,  37,  38,  40,  43,  46,  50,  54,  58,  63,  70,  72,
+             36,  35,  34,  36,  38,  42,  48,  50,  53,  56,  60,  63,  68,  73,  79,  81,
+             44,  42,  41,  42,  42,  48,  54,  58,  63,  67,  71,  75,  79,  84,  90,  92,
+             53,  51,  49,  50,  49,  54,  60,  65,  71,  76,  82,  87,  92,  97, 104, 106,
+             65,  62,  59,  59,  58,  63,  68,  73,  79,  85,  92,  98, 105, 111, 118, 121,
+             79,  75,  72,  71,  69,  73,  78,  84,  90,  96, 103, 110, 118, 125, 133, 136,
+        }, {
+             32,  31,  30,  33,  37,  42,  49,  48,  49,  50,  52,  54,  57,  60,  63,  64,
+             31,  31,  32,  36,  40,  43,  46,  46,  45,  46,  48,  50,  52,  54,  57,  59,
+             37,  38,  40,  43,  47,  47,  48,  47,  46,  46,  47,  49,  50,  52,  55,  56,
+             48,  47,  46,  47,  47,  50,  53,  53,  53,  54,  54,  55,  56,  58,  60,  61,
+             49,  47,  45,  46,  45,  49,  53,  56,  58,  59,  61,  62,  64,  65,  67,  68,
+             52,  50,  48,  47,  47,  50,  54,  57,  61,  64,  66,  68,  70,  72,  75,  75,
+             57,  54,  52,  51,  50,  53,  57,  60,  64,  67,  71,  73,  76,  79,  82,  83,
+             63,  60,  57,  56,  54,  57,  60,  64,  67,  71,  75,  78,  82,  85,  89,  90,
+        },
+    }, {
+        {
+             32,  31,  31,  32,  32,  34,  35,  38,  41,  44,  48,  53,  58,  65,  71,  79,
+             31,  32,  32,  32,  33,  34,  34,  36,  39,  42,  45,  49,  54,  60,  65,  72,
+             32,  32,  33,  34,  35,  37,  38,  40,  41,  43,  46,  50,  54,  58,  63,  70,
+             36,  35,  34,  36,  38,  42,  47,  49,  51,  54,  56,  60,  63,  68,  73,  79,
+             44,  42,  41,  42,  42,  48,  52,  56,  60,  64,  67,  71,  75,  79,  84,  90,
+             53,  51,  49,  50,  49,  54,  59,  63,  67,  72,  76,  82,  87,  92,  97, 104,
+             62,  59,  57,  57,  56,  61,  65,  69,  74,  79,  83,  90,  95, 102, 108, 115,
+             73,  69,  66,  65,  64,  69,  73,  77,  81,  86,  91,  99, 105, 112, 119, 127,
+        }, {
+             32,  31,  30,  33,  37,  42,  47,  48,  48,  49,  50,  52,  54,  57,  60,  63,
+             31,  31,  32,  36,  40,  43,  46,  46,  45,  45,  46,  48,  50,  52,  54,  57,
+             37,  38,  40,  43,  47,  47,  48,  47,  46,  46,  46,  47,  49,  50,  52,  55,
+             48,  47,  46,  47,  47,  50,  52,  53,  53,  53,  54,  54,  55,  56,  58,  60,
+             49,  47,  45,  46,  45,  49,  53,  55,  57,  58,  59,  61,  62,  64,  65,  67,
+             52,  50,  48,  47,  47,  50,  53,  56,  59,  62,  64,  66,  68,  70,  72,  75,
+             56,  53,  51,  50,  49,  53,  55,  58,  61,  64,  66,  70,  72,  75,  77,  80,
+             61,  57,  55,  54,  52,  56,  58,  61,  63,  66,  69,  73,  76,  79,  82,  86,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  34,  36,  38,  41,  44,  48,  53,  57,  61,  65,
+             31,  32,  32,  32,  32,  33,  34,  34,  37,  39,  41,  45,  49,  53,  56,  60,
+             32,  32,  33,  34,  34,  35,  37,  38,  40,  41,  43,  46,  50,  53,  56,  58,
+             35,  35,  34,  35,  36,  37,  41,  46,  47,  49,  51,  54,  57,  60,  63,  66,
+             39,  38,  37,  38,  39,  40,  44,  50,  52,  54,  57,  60,  64,  67,  69,  72,
+             44,  42,  41,  42,  42,  42,  48,  54,  57,  60,  63,  67,  71,  74,  77,  79,
+             53,  51,  49,  49,  49,  49,  54,  60,  64,  67,  71,  76,  82,  86,  89,  92,
+             65,  62,  59,  59,  58,  58,  63,  68,  72,  76,  79,  85,  92,  97, 100, 105,
+        }, {
+             32,  31,  30,  33,  35,  37,  42,  49,  48,  48,  49,  50,  52,  54,  55,  57,
+             31,  31,  32,  35,  37,  40,  43,  46,  46,  45,  45,  46,  48,  49,  51,  52,
+             37,  38,  40,  42,  44,  47,  47,  48,  47,  46,  46,  46,  47,  48,  49,  50,
+             45,  45,  44,  46,  46,  47,  49,  52,  51,  51,  51,  52,  53,  54,  54,  55,
+             48,  47,  45,  46,  46,  47,  50,  53,  54,  54,  55,  56,  57,  58,  58,  59,
+             49,  47,  45,  45,  45,  45,  49,  53,  55,  57,  58,  59,  61,  62,  63,  64,
+             52,  50,  48,  47,  47,  47,  50,  54,  57,  59,  61,  64,  66,  68,  69,  70,
+             57,  54,  52,  51,  51,  50,  53,  57,  59,  61,  64,  67,  71,  73,  74,  76,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  32,  34,  36,  38,  39,  44,  47,  49,  53,  58,
+             31,  32,  32,  32,  32,  33,  33,  34,  34,  36,  37,  41,  44,  46,  49,  54,
+             32,  32,  32,  33,  34,  35,  35,  36,  37,  39,  40,  42,  45,  47,  50,  54,
+             32,  33,  33,  33,  34,  36,  36,  38,  40,  41,  42,  45,  47,  48,  51,  55,
+             36,  35,  35,  35,  36,  38,  40,  42,  48,  49,  50,  53,  56,  57,  60,  63,
+             44,  42,  41,  41,  42,  42,  44,  48,  54,  56,  58,  63,  66,  67,  71,  75,
+             47,  45,  44,  44,  45,  45,  47,  50,  56,  58,  60,  66,  69,  71,  75,  79,
+             53,  51,  49,  49,  50,  49,  51,  54,  60,  63,  65,  71,  75,  77,  82,  87,
+        }, {
+             32,  31,  30,  31,  33,  37,  39,  42,  49,  48,  48,  49,  50,  51,  52,  54,
+             31,  31,  32,  33,  36,  40,  41,  43,  46,  46,  46,  45,  46,  47,  48,  50,
+             35,  37,  38,  38,  41,  45,  46,  46,  48,  47,  46,  45,  46,  47,  47,  49,
+             38,  40,  40,  41,  44,  47,  47,  48,  49,  48,  48,  47,  48,  48,  48,  50,
+             48,  47,  46,  46,  47,  47,  48,  50,  53,  53,  53,  53,  54,  54,  54,  55,
+             49,  47,  45,  45,  46,  45,  47,  49,  53,  55,  56,  58,  59,  60,  61,  62,
+             50,  48,  46,  46,  46,  46,  47,  50,  54,  55,  56,  59,  61,  61,  63,  65,
+             52,  50,  48,  48,  47,  47,  48,  50,  54,  56,  57,  61,  63,  64,  66,  68,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  34,  35,  36,  39,  41,  44,  47,  48,
+             31,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  37,  39,  41,  44,  45,
+             31,  32,  32,  32,  33,  33,  34,  34,  35,  36,  36,  39,  40,  42,  44,  45,
+             32,  32,  32,  33,  34,  34,  35,  36,  37,  38,  38,  40,  41,  43,  45,  46,
+             35,  35,  34,  34,  35,  36,  37,  39,  41,  45,  46,  48,  49,  51,  53,  54,
+             36,  35,  35,  34,  36,  36,  38,  40,  42,  47,  48,  50,  51,  53,  56,  56,
+             44,  42,  41,  41,  42,  42,  42,  44,  48,  52,  54,  58,  60,  63,  66,  67,
+             47,  45,  45,  44,  44,  45,  45,  47,  50,  55,  56,  60,  62,  66,  69,  70,
+        }, {
+             32,  31,  31,  30,  33,  33,  37,  39,  42,  47,  49,  48,  48,  49,  50,  50,
+             31,  31,  32,  32,  35,  36,  40,  41,  43,  46,  46,  46,  45,  45,  46,  46,
+             33,  34,  34,  35,  37,  38,  43,  43,  44,  46,  47,  46,  46,  45,  46,  46,
+             37,  38,  39,  40,  42,  43,  47,  47,  47,  48,  48,  47,  46,  46,  46,  46,
+             45,  45,  45,  44,  46,  46,  47,  48,  49,  51,  52,  51,  51,  51,  52,  52,
+             48,  47,  46,  46,  47,  47,  47,  48,  50,  52,  53,  53,  53,  53,  54,  54,
+             49,  47,  46,  45,  45,  46,  45,  47,  49,  53,  53,  56,  57,  58,  59,  59,
+             50,  48,  47,  46,  46,  46,  46,  47,  50,  53,  54,  56,  57,  59,  61,  61,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  32,  34,  34,  36,  36,  39,  39,  44,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  37,  37,  41,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  37,  37,  41,
+             32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  43,
+             32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  43,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,  50,  50,  53,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,  50,  50,  53,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  48,  48,  54,  54,  58,  58,  63,
+        }, {
+             32,  31,  31,  30,  30,  33,  33,  37,  37,  42,  42,  49,  49,  48,  48,  49,
+             31,  31,  31,  32,  32,  36,  36,  40,  40,  43,  43,  46,  46,  46,  46,  45,
+             31,  31,  31,  32,  32,  36,  36,  40,  40,  43,  43,  46,  46,  46,  46,  45,
+             37,  38,  38,  40,  40,  43,  43,  47,  47,  47,  47,  48,  48,  47,  47,  46,
+             37,  38,  38,  40,  40,  43,  43,  47,  47,  47,  47,  48,  48,  47,  47,  46,
+             48,  47,  47,  46,  46,  47,  47,  47,  47,  50,  50,  53,  53,  53,  53,  53,
+             48,  47,  47,  46,  46,  47,  47,  47,  47,  50,  50,  53,  53,  53,  53,  53,
+             49,  47,  47,  45,  45,  46,  46,  45,  45,  49,  49,  53,  53,  56,  56,  58,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  33,  34,  34,  36,  36,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,
+             31,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  35,  35,  35,  36,  36,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,
+             35,  35,  35,  34,  34,  34,  35,  36,  36,  37,  37,  40,  41,  43,  46,  46,
+             36,  35,  35,  35,  34,  34,  36,  36,  37,  38,  38,  41,  42,  44,  48,  48,
+        }, {
+             32,  31,  31,  31,  30,  30,  33,  33,  35,  37,  37,  41,  42,  44,  49,  49,
+             31,  31,  31,  31,  32,  32,  34,  35,  37,  39,  39,  42,  42,  44,  47,  47,
+             31,  31,  31,  32,  32,  32,  35,  36,  37,  40,  40,  42,  43,  44,  46,  46,
+             33,  34,  34,  34,  35,  35,  37,  38,  40,  43,  43,  44,  44,  45,  47,  47,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,
+             45,  45,  45,  45,  44,  44,  46,  46,  46,  47,  47,  49,  49,  50,  52,  52,
+             48,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  49,  50,  51,  53,  53,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  35,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,
+             32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  35,  36,  36,  36,  38,
+        }, {
+             32,  31,  31,  31,  31,  30,  30,  31,  33,  33,  33,  35,  37,  37,  39,  42,
+             31,  31,  31,  31,  31,  31,  31,  32,  34,  35,  35,  37,  39,  39,  40,  42,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,
+             33,  33,  34,  34,  34,  35,  35,  35,  37,  38,  38,  41,  43,  43,  43,  44,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,
+             38,  39,  40,  40,  40,  41,  41,  41,  43,  44,  44,  46,  47,  47,  47,  48,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  31,  33,  33,  33,  33,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  34,  34,  34,  34,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  33,  34,  35,  35,  35,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,
+             33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  36,  37,  38,  38,  38,
+             35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,  40,  41,  41,  41,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+        },
+    },
+};
+
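+/* qm_tbl_32x8: 8 rows of 32 coefficients (256 bytes) per plane. The weights
+ * appear to use 32 as unity (the flattest matrices sit at 31..32), matching
+ * the AV1 reference code, where dequantizers are scaled by qm/32 with
+ * AOM_QM_BITS == 5. Illustrative sketch only; the name below is
+ * hypothetical: */
+static inline int dequant_with_qm(const int dqv, const uint8_t weight) {
+    return (dqv * weight + 16) >> 5;  /* round-to-nearest scale by weight/32 */
+}
+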
+static const uint8_t qm_tbl_32x8[][2][256] = {
+    {
+        {
+             32,  31,  31,  31,  32,  32,  34,  35,  36,  39,  44,  46,  48,  53,  58,  61,  65,  71,  79,  81,  82,  88,  91,  94,  97, 100, 103, 107, 110, 114, 118, 122,
+             32,  32,  33,  33,  34,  35,  37,  37,  38,  40,  43,  44,  46,  50,  54,  56,  58,  63,  70,  71,  72,  77,  80,  83,  86,  89,  93,  96, 100, 104, 107, 111,
+             36,  35,  34,  35,  36,  38,  42,  45,  48,  50,  53,  55,  56,  60,  63,  66,  68,  73,  79,  80,  81,  85,  88,  91,  94,  97,  98, 100, 101, 103, 105, 107,
+             53,  51,  49,  49,  50,  49,  54,  57,  60,  65,  71,  73,  76,  82,  87,  89,  92,  97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
+             65,  62,  59,  59,  59,  58,  63,  65,  68,  73,  79,  82,  85,  92,  98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
+             87,  82,  78,  78,  77,  75,  79,  82,  84,  89,  95,  98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
+             93,  88,  86,  84,  82,  82,  80,  84,  86,  91,  94,  98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
+             99,  94,  93,  90,  89,  89,  88,  87,  90,  93,  97,  99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+        }, {
+             32,  31,  30,  32,  33,  37,  42,  45,  49,  48,  49,  49,  50,  52,  54,  55,  57,  60,  63,  64,  64,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
+             37,  38,  40,  41,  43,  47,  47,  47,  48,  47,  46,  46,  46,  47,  49,  49,  50,  52,  55,  55,  56,  58,  59,  60,  62,  63,  64,  65,  67,  68,  69,  70,
+             48,  47,  46,  46,  47,  47,  50,  51,  53,  53,  53,  53,  54,  54,  55,  56,  56,  58,  60,  61,  61,  63,  64,  65,  66,  67,  66,  66,  66,  66,  67,  67,
+             52,  50,  48,  48,  47,  47,  50,  52,  54,  57,  61,  62,  64,  66,  68,  69,  70,  72,  75,  75,  75,  76,  74,  72,  73,  74,  75,  75,  74,  74,  73,  73,
+             57,  54,  52,  51,  51,  50,  53,  55,  57,  60,  64,  65,  67,  71,  73,  75,  76,  79,  82,  82,  83,  85,  86,  85,  83,  82,  83,  84,  84,  83,  82,  81,
+             66,  63,  60,  59,  59,  57,  60,  61,  62,  66,  69,  71,  73,  77,  80,  82,  84,  88,  92,  92,  93,  95,  94,  95,  96,  93,  92,  93,  94,  93,  91,  90,
+             68,  65,  63,  62,  60,  60,  59,  61,  62,  65,  66,  68,  72,  73,  76,  79,  80,  84,  87,  89,  93,  94,  98,  99,  99, 102, 101, 102, 103, 103, 101,  99,
+             71,  67,  66,  64,  63,  62,  62,  61,  62,  64,  66,  67,  70,  71,  75,  76,  79,  81,  84,  86,  89,  91,  94,  97,  98, 102, 104, 106, 106, 109, 109, 108,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  32,  34,  36,  38,  39,  44,  47,  49,  53,  58,  61,  65,  71,  76,  79,  82,  86,  89,  92,  95,  98, 101, 104, 107, 110, 114,
+             32,  32,  32,  33,  34,  35,  35,  36,  37,  39,  40,  42,  45,  47,  50,  54,  56,  59,  64,  68,  70,  73,  76,  79,  82,  85,  88,  91,  94,  97, 100, 104,
+             36,  35,  35,  35,  36,  38,  40,  42,  48,  49,  50,  53,  56,  57,  60,  63,  65,  68,  73,  76,  79,  81,  84,  87,  89,  92,  93,  94,  95,  96,  98, 100,
+             47,  45,  44,  44,  45,  45,  47,  50,  56,  58,  60,  66,  69,  71,  75,  79,  81,  84,  89,  92,  95,  97, 100,  99, 101, 105, 108, 110, 110, 110, 111, 111,
+             65,  62,  60,  59,  59,  58,  60,  63,  68,  71,  73,  79,  84,  86,  92,  98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
+             79,  75,  72,  71,  71,  69,  71,  73,  78,  81,  84,  90,  95,  97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
+             90,  86,  84,  82,  80,  80,  78,  82,  83,  88,  91,  94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
+             96,  91,  90,  87,  87,  86,  85,  84,  87,  90,  94,  96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
+        }, {
+             32,  31,  30,  31,  33,  37,  39,  42,  49,  48,  48,  49,  50,  51,  52,  54,  55,  57,  60,  62,  63,  64,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
+             35,  37,  38,  38,  41,  45,  46,  46,  48,  47,  46,  45,  46,  47,  47,  49,  49,  50,  53,  54,  55,  56,  58,  59,  60,  61,  62,  64,  65,  66,  67,  68,
+             48,  47,  46,  46,  47,  47,  48,  50,  53,  53,  53,  53,  54,  54,  54,  55,  56,  56,  58,  60,  60,  61,  62,  63,  64,  65,  65,  65,  65,  65,  65,  65,
+             50,  48,  46,  46,  46,  46,  47,  50,  54,  55,  56,  59,  61,  61,  63,  65,  65,  66,  68,  69,  70,  71,  72,  71,  71,  72,  73,  73,  72,  72,  71,  71,
+             57,  54,  52,  52,  51,  50,  51,  53,  57,  58,  60,  64,  66,  68,  71,  73,  74,  76,  79,  81,  82,  83,  84,  83,  81,  80,  81,  82,  82,  81,  79,  78,
+             63,  60,  58,  57,  56,  54,  55,  57,  60,  62,  64,  67,  70,  71,  75,  78,  79,  82,  85,  87,  89,  90,  91,  93,  94,  91,  89,  90,  92,  90,  89,  87,
+             68,  64,  63,  61,  60,  59,  58,  60,  61,  64,  65,  67,  71,  72,  75,  78,  79,  83,  85,  87,  91,  92,  95,  96,  97,  99,  98,  99, 100, 100,  98,  96,
+             70,  66,  65,  63,  63,  62,  61,  60,  61,  63,  65,  66,  69,  70,  74,  74,  78,  79,  82,  84,  87,  89,  91,  94,  96, 100, 101, 103, 103, 105, 105, 105,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  32,  34,  34,  36,  39,  40,  44,  47,  49,  53,  57,  59,  65,  69,  71,  79,  81,  82,  87,  90,  92,  95,  98, 100, 103, 106,
+             32,  32,  32,  32,  33,  34,  34,  35,  36,  37,  39,  40,  42,  45,  46,  50,  53,  54,  59,  62,  64,  71,  72,  73,  77,  80,  83,  85,  88,  91,  94,  97,
+             36,  35,  35,  34,  36,  37,  38,  42,  44,  48,  50,  51,  53,  56,  57,  60,  63,  64,  68,  71,  73,  79,  80,  81,  85,  87,  88,  88,  89,  90,  92,  93,
+             44,  42,  41,  41,  42,  42,  42,  48,  50,  54,  58,  59,  63,  66,  67,  71,  74,  75,  79,  83,  84,  90,  91,  92,  96,  99, 102, 103, 103, 103, 103, 104,
+             58,  55,  54,  53,  53,  53,  52,  57,  59,  63,  68,  70,  74,  79,  81,  86,  90,  91,  97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
+             79,  75,  73,  72,  71,  70,  69,  73,  75,  78,  84,  85,  90,  95,  97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
+             88,  83,  81,  79,  78,  77,  76,  79,  81,  85,  88,  91,  97,  99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
+             93,  88,  88,  84,  84,  83,  82,  81,  84,  86,  90,  92,  97,  98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
+        }, {
+             32,  31,  31,  30,  33,  35,  37,  42,  44,  49,  48,  48,  49,  50,  51,  52,  54,  54,  57,  59,  60,  63,  64,  64,  66,  67,  68,  69,  70,  71,  72,  73,
+             34,  35,  36,  36,  40,  42,  44,  45,  46,  47,  46,  46,  45,  46,  47,  47,  49,  49,  51,  52,  53,  56,  57,  57,  59,  60,  61,  62,  63,  64,  65,  66,
+             48,  47,  46,  46,  47,  47,  47,  50,  51,  53,  53,  53,  53,  54,  54,  54,  55,  55,  56,  58,  58,  60,  61,  61,  63,  63,  63,  63,  63,  63,  63,  63,
+             49,  47,  46,  45,  46,  45,  45,  49,  51,  53,  56,  56,  58,  59,  60,  61,  62,  62,  64,  65,  65,  67,  68,  68,  69,  70,  71,  71,  70,  70,  69,  69,
+             54,  51,  50,  49,  49,  48,  48,  51,  53,  55,  58,  59,  62,  65,  65,  68,  70,  70,  73,  74,  75,  77,  78,  78,  79,  78,  79,  80,  80,  78,  77,  76,
+             63,  60,  58,  57,  56,  55,  54,  57,  59,  60,  64,  65,  67,  70,  71,  75,  77,  78,  82,  84,  85,  89,  89,  90,  91,  88,  87,  88,  89,  88,  86,  84,
+             67,  63,  62,  60,  59,  58,  57,  59,  60,  63,  64,  66,  70,  70,  73,  76,  77,  81,  83,  85,  89,  90,  93,  94,  94,  96,  96,  96,  97,  97,  95,  93,
+             69,  65,  65,  62,  62,  61,  60,  59,  61,  62,  64,  65,  68,  68,  72,  73,  76,  77,  81,  82,  85,  87,  89,  92,  93,  97,  98, 100, 100, 102, 102, 101,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  34,  35,  36,  39,  41,  44,  47,  48,  53,  55,  58,  63,  65,  71,  74,  79,  82,  82,  87,  89,  92,  94,  97,  99,
+             31,  32,  32,  32,  33,  33,  34,  34,  35,  36,  36,  39,  40,  42,  44,  45,  50,  51,  54,  58,  59,  64,  67,  71,  73,  74,  78,  81,  83,  85,  88,  91,
+             35,  35,  34,  34,  35,  36,  37,  39,  41,  45,  46,  48,  49,  51,  53,  54,  57,  59,  61,  65,  66,  71,  73,  77,  79,  79,  83,  83,  84,  85,  86,  87,
+             44,  42,  41,  41,  42,  42,  42,  44,  48,  52,  54,  58,  60,  63,  66,  67,  71,  72,  75,  78,  79,  84,  86,  90,  92,  92,  96,  97,  97,  97,  97,  97,
+             53,  51,  50,  49,  49,  50,  49,  51,  54,  59,  60,  65,  67,  71,  75,  76,  82,  84,  87,  91,  92,  97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
+             65,  62,  61,  59,  59,  59,  58,  60,  63,  67,  68,  73,  76,  79,  84,  85,  92,  94,  98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+             82,  78,  76,  74,  73,  73,  71,  73,  76,  79,  80,  86,  88,  92,  97,  98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
+             90,  86,  85,  82,  81,  80,  79,  78,  81,  83,  87,  88,  93,  94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
+        }, {
+             32,  31,  31,  30,  33,  33,  37,  39,  42,  47,  49,  48,  48,  49,  50,  50,  52,  53,  54,  56,  57,  60,  61,  63,  64,  64,  66,  67,  68,  69,  70,  70,
+             33,  34,  34,  35,  37,  38,  43,  43,  44,  46,  47,  46,  46,  45,  46,  46,  47,  48,  49,  51,  51,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
+             45,  45,  45,  44,  46,  46,  47,  48,  49,  51,  52,  51,  51,  51,  52,  52,  53,  53,  54,  55,  55,  57,  58,  59,  60,  60,  61,  61,  61,  61,  61,  61,
+             49,  47,  46,  45,  45,  46,  45,  47,  49,  53,  53,  56,  57,  58,  59,  59,  61,  61,  62,  63,  64,  65,  66,  67,  68,  68,  69,  69,  68,  68,  67,  67,
+             52,  50,  49,  48,  47,  47,  47,  48,  50,  53,  54,  57,  59,  61,  63,  64,  66,  67,  68,  70,  70,  72,  73,  75,  75,  75,  77,  78,  77,  76,  75,  74,
+             57,  54,  53,  52,  51,  51,  50,  51,  53,  56,  57,  60,  61,  64,  66,  67,  71,  72,  73,  76,  76,  79,  80,  82,  83,  83,  84,  85,  86,  85,  83,  82,
+             64,  61,  60,  58,  57,  57,  55,  56,  58,  61,  61,  64,  66,  68,  71,  71,  75,  77,  79,  82,  83,  86,  87,  90,  91,  91,  93,  93,  94,  94,  92,  90,
+             68,  64,  64,  61,  61,  60,  59,  58,  60,  61,  63,  64,  67,  67,  71,  71,  74,  75,  79,  80,  83,  85,  87,  89,  91,  94,  95,  97,  97,  99,  98,  98,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  33,  34,  36,  36,  39,  40,  44,  46,  48,  52,  53,  58,  58,  65,  66,  71,  74,  79,  81,  82,  86,  88,  91,  93,
+             31,  32,  32,  32,  32,  33,  33,  33,  34,  34,  35,  35,  38,  39,  41,  43,  45,  48,  49,  53,  54,  59,  60,  65,  67,  72,  73,  74,  78,  80,  82,  85,
+             33,  33,  33,  33,  34,  35,  36,  36,  38,  39,  42,  42,  44,  45,  46,  48,  50,  52,  53,  57,  57,  62,  63,  67,  69,  73,  75,  75,  78,  80,  80,  81,
+             40,  39,  39,  38,  38,  39,  40,  41,  44,  45,  51,  51,  54,  56,  59,  60,  62,  65,  66,  69,  70,  74,  75,  79,  81,  85,  86,  87,  90,  90,  90,  90,
+             51,  49,  49,  47,  47,  48,  48,  48,  52,  53,  58,  59,  63,  65,  69,  72,  74,  78,  79,  83,  84,  89,  90,  94,  97, 101, 102, 103, 106, 105, 103, 103,
+             65,  62,  61,  59,  59,  59,  58,  58,  62,  63,  68,  68,  73,  75,  79,  82,  85,  90,  92,  97,  98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+             79,  75,  74,  72,  71,  71,  69,  69,  72,  73,  78,  79,  84,  85,  90,  93,  96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+             87,  83,  82,  79,  79,  78,  77,  75,  78,  80,  84,  85,  89,  90,  96,  97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
+        }, {
+             32,  31,  31,  30,  31,  33,  35,  37,  41,  42,  49,  49,  48,  48,  49,  49,  50,  51,  52,  54,  54,  57,  57,  60,  61,  63,  64,  64,  66,  67,  68,  68,
+             32,  33,  33,  33,  35,  37,  39,  41,  43,  43,  47,  47,  46,  46,  45,  46,  46,  47,  48,  49,  50,  52,  52,  54,  55,  57,  58,  58,  59,  60,  61,  62,
+             40,  41,  41,  42,  43,  44,  46,  47,  48,  48,  50,  50,  49,  49,  48,  49,  49,  49,  50,  51,  51,  52,  53,  55,  56,  57,  58,  58,  59,  59,  59,  59,
+             49,  47,  47,  45,  46,  46,  46,  46,  49,  49,  53,  53,  54,  55,  56,  57,  57,  58,  58,  59,  59,  60,  61,  62,  63,  64,  65,  65,  66,  66,  65,  65,
+             51,  49,  49,  47,  47,  47,  47,  46,  49,  50,  54,  54,  57,  58,  61,  62,  63,  64,  65,  67,  67,  69,  69,  71,  72,  73,  73,  74,  75,  74,  72,  71,
+             57,  54,  54,  52,  51,  51,  50,  50,  52,  53,  57,  57,  60,  61,  64,  65,  67,  69,  71,  73,  73,  76,  77,  79,  80,  82,  82,  83,  84,  82,  81,  79,
+             63,  60,  59,  57,  57,  56,  55,  54,  57,  57,  60,  61,  64,  65,  67,  69,  71,  73,  75,  77,  78,  82,  82,  85,  86,  89,  89,  90,  91,  91,  89,  87,
+             67,  63,  63,  60,  60,  59,  58,  57,  59,  60,  62,  63,  65,  66,  69,  70,  73,  74,  77,  78,  81,  83,  85,  87,  88,  92,  92,  94,  94,  96,  95,  95,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  32,  34,  34,  36,  36,  39,  39,  44,  44,  48,  48,  53,  53,  58,  58,  65,  65,  71,  71,  79,  79,  82,  82,  87,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  37,  37,  41,  41,  45,  45,  49,  49,  54,  54,  60,  60,  65,  65,  72,  72,  75,  75,  79,
+             32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  43,  43,  46,  46,  50,  50,  54,  54,  58,  58,  63,  63,  70,  70,  72,  72,  76,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,  50,  50,  53,  53,  56,  56,  60,  60,  63,  63,  68,  68,  73,  73,  79,  79,  81,  81,  84,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  48,  48,  54,  54,  58,  58,  63,  63,  67,  67,  71,  71,  75,  75,  79,  79,  84,  84,  90,  90,  92,  92,  96,
+             53,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,  82,  87,  87,  92,  92,  97,  97, 104, 104, 106, 106, 109,
+             65,  62,  62,  59,  59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,  98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+             79,  75,  75,  72,  72,  71,  71,  69,  69,  73,  73,  78,  78,  84,  84,  90,  90,  96,  96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+        }, {
+             32,  31,  31,  30,  30,  33,  33,  37,  37,  42,  42,  49,  49,  48,  48,  49,  49,  50,  50,  52,  52,  54,  54,  57,  57,  60,  60,  63,  63,  64,  64,  66,
+             31,  31,  31,  32,  32,  36,  36,  40,  40,  43,  43,  46,  46,  46,  46,  45,  45,  46,  46,  48,  48,  50,  50,  52,  52,  54,  54,  57,  57,  59,  59,  60,
+             37,  38,  38,  40,  40,  43,  43,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  47,  47,  49,  49,  50,  50,  52,  52,  55,  55,  56,  56,  57,
+             48,  47,  47,  46,  46,  47,  47,  47,  47,  50,  50,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  56,  56,  58,  58,  60,  60,  61,  61,  63,
+             49,  47,  47,  45,  45,  46,  46,  45,  45,  49,  49,  53,  53,  56,  56,  58,  58,  59,  59,  61,  61,  62,  62,  64,  64,  65,  65,  67,  67,  68,  68,  69,
+             52,  50,  50,  48,  48,  47,  47,  47,  47,  50,  50,  54,  54,  57,  57,  61,  61,  64,  64,  66,  66,  68,  68,  70,  70,  72,  72,  75,  75,  75,  75,  77,
+             57,  54,  54,  52,  52,  51,  51,  50,  50,  53,  53,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  73,  73,  76,  76,  79,  79,  82,  82,  83,  83,  84,
+             63,  60,  60,  57,  57,  56,  56,  54,  54,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  75,  75,  78,  78,  82,  82,  85,  85,  89,  89,  90,  90,  92,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  34,  35,  36,  38,  39,  41,  44,  44,  48,  48,  53,  53,  57,  58,  61,  65,  67,  71,  72,  79,  79,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  36,  37,  39,  41,  42,  45,  45,  49,  49,  52,  54,  57,  60,  61,  65,  66,  72,  72,
+             32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  41,  43,  43,  46,  46,  49,  50,  52,  54,  56,  58,  60,  63,  64,  70,  70,
+             36,  35,  35,  35,  34,  35,  36,  37,  38,  39,  42,  42,  47,  48,  49,  50,  51,  53,  54,  56,  56,  59,  60,  62,  63,  66,  68,  69,  73,  73,  79,  79,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  43,  48,  48,  52,  54,  56,  58,  60,  63,  64,  67,  67,  71,  71,  74,  75,  77,  79,  81,  84,  85,  90,  90,
+             53,  51,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,  60,  63,  65,  67,  71,  72,  76,  76,  81,  82,  85,  87,  89,  92,  94,  97,  98, 104, 104,
+             62,  60,  59,  58,  57,  57,  57,  56,  56,  56,  61,  61,  65,  66,  69,  71,  74,  78,  79,  83,  83,  89,  90,  94,  95,  98, 102, 103, 108, 108, 115, 115,
+             73,  70,  69,  67,  66,  66,  65,  65,  64,  64,  69,  69,  73,  74,  77,  79,  81,  85,  86,  91,  91,  98,  99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
+        }, {
+             32,  31,  31,  30,  30,  32,  33,  34,  37,  37,  42,  42,  47,  49,  48,  48,  48,  49,  49,  50,  50,  52,  52,  53,  54,  55,  57,  58,  60,  60,  63,  63,
+             31,  31,  31,  32,  32,  34,  36,  37,  40,  40,  43,  43,  46,  46,  46,  46,  45,  45,  45,  46,  46,  48,  48,  49,  50,  51,  52,  53,  54,  55,  57,  57,
+             37,  38,  38,  39,  40,  41,  43,  44,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  46,  47,  47,  48,  49,  49,  50,  51,  52,  53,  55,  55,
+             48,  47,  47,  46,  46,  46,  47,  47,  47,  48,  50,  50,  52,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  56,  56,  57,  58,  59,  60,  60,
+             49,  47,  47,  46,  45,  45,  46,  45,  45,  46,  49,  49,  53,  53,  55,  56,  57,  58,  58,  59,  59,  61,  61,  62,  62,  63,  64,  64,  65,  65,  67,  67,
+             52,  50,  50,  48,  48,  48,  47,  47,  47,  47,  50,  50,  53,  54,  56,  57,  59,  61,  62,  64,  64,  66,  66,  68,  68,  69,  70,  71,  72,  73,  75,  75,
+             56,  54,  53,  52,  51,  51,  50,  50,  49,  49,  53,  53,  55,  56,  58,  59,  61,  63,  64,  66,  66,  69,  70,  71,  72,  74,  75,  76,  77,  78,  80,  80,
+             61,  58,  57,  56,  55,  54,  54,  53,  52,  53,  56,  56,  58,  59,  61,  62,  63,  66,  66,  69,  69,  72,  73,  75,  76,  78,  79,  80,  82,  83,  86,  86,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  33,  34,  34,  36,  36,  38,  39,  41,  44,  44,  47,  48,  50,  53,  53,  57,  58,  61,  65,  65,  70,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  37,  37,  39,  41,  41,  44,  45,  46,  49,  49,  53,  54,  56,  60,  60,  64,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,  40,  40,  41,  43,  43,  45,  46,  47,  50,  50,  53,  54,  56,  58,  58,  62,
+             35,  35,  35,  34,  34,  34,  35,  36,  36,  37,  37,  40,  41,  43,  46,  46,  47,  48,  49,  51,  51,  53,  54,  55,  57,  57,  60,  61,  63,  66,  66,  70,
+             39,  38,  38,  37,  37,  37,  38,  38,  39,  40,  40,  43,  44,  46,  50,  50,  52,  53,  54,  57,  57,  59,  60,  61,  64,  64,  67,  68,  69,  72,  72,  76,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  46,  48,  50,  54,  54,  57,  58,  60,  63,  63,  66,  67,  68,  71,  71,  74,  75,  77,  79,  79,  83,
+             53,  52,  51,  50,  49,  49,  49,  50,  49,  49,  49,  53,  54,  56,  60,  60,  64,  65,  67,  71,  71,  75,  76,  78,  82,  82,  86,  87,  89,  92,  92,  96,
+             65,  63,  62,  61,  59,  59,  59,  59,  58,  58,  58,  62,  63,  65,  68,  68,  72,  73,  76,  79,  79,  84,  85,  88,  92,  92,  97,  98, 100, 105, 105, 109,
+        }, {
+             32,  31,  31,  31,  30,  30,  33,  33,  35,  37,  37,  41,  42,  44,  49,  49,  48,  48,  48,  49,  49,  50,  50,  51,  52,  52,  54,  54,  55,  57,  57,  59,
+             31,  31,  31,  32,  32,  32,  35,  36,  37,  40,  40,  42,  43,  44,  46,  46,  46,  46,  45,  45,  45,  46,  46,  47,  48,  48,  49,  50,  51,  52,  52,  54,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  49,  49,  50,  50,  52,
+             45,  45,  45,  45,  44,  44,  46,  46,  46,  47,  47,  49,  49,  50,  52,  52,  51,  51,  51,  51,  51,  52,  52,  52,  53,  53,  54,  54,  54,  55,  55,  57,
+             48,  47,  47,  46,  45,  45,  46,  46,  46,  47,  47,  49,  50,  51,  53,  53,  54,  54,  54,  55,  55,  56,  56,  56,  57,  57,  58,  58,  58,  59,  59,  61,
+             49,  47,  47,  46,  45,  45,  45,  46,  45,  45,  45,  48,  49,  51,  53,  53,  55,  56,  57,  58,  58,  59,  59,  60,  61,  61,  62,  62,  63,  64,  64,  65,
+             52,  50,  50,  49,  48,  48,  47,  47,  47,  47,  47,  50,  50,  52,  54,  54,  57,  57,  59,  61,  61,  63,  64,  65,  66,  66,  68,  68,  69,  70,  70,  72,
+             57,  55,  54,  53,  52,  52,  51,  51,  51,  50,  50,  52,  53,  54,  57,  57,  59,  60,  61,  64,  64,  66,  67,  68,  71,  71,  73,  73,  74,  76,  76,  78,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  34,  34,  35,  36,  36,  38,  39,  39,  42,  44,  44,  47,  48,  49,  53,  53,  55,  58,  58,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  36,  37,  37,  40,  41,  41,  44,  45,  46,  49,  49,  51,  54,  54,
+             32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  35,  36,  36,  37,  37,  37,  39,  40,  40,  42,  42,  43,  45,  46,  47,  49,  50,  51,  54,  54,
+             32,  33,  33,  33,  33,  33,  33,  34,  34,  35,  36,  36,  36,  38,  38,  39,  40,  40,  41,  42,  42,  44,  45,  45,  47,  48,  48,  51,  51,  53,  55,  55,
+             36,  35,  35,  35,  35,  34,  35,  36,  36,  37,  38,  38,  40,  42,  42,  45,  48,  48,  49,  50,  50,  52,  53,  54,  56,  56,  57,  59,  60,  61,  63,  63,
+             44,  43,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  44,  48,  48,  50,  54,  54,  56,  58,  58,  61,  63,  63,  66,  67,  67,  71,  71,  72,  75,  75,
+             47,  46,  45,  45,  44,  44,  44,  45,  45,  45,  45,  45,  47,  50,  50,  53,  56,  56,  58,  60,  60,  64,  66,  66,  69,  70,  71,  74,  75,  76,  79,  79,
+             53,  52,  51,  51,  49,  49,  49,  49,  50,  49,  49,  49,  51,  54,  54,  57,  60,  60,  63,  65,  65,  69,  71,  72,  75,  76,  77,  81,  82,  83,  87,  87,
+        }, {
+             32,  31,  31,  31,  30,  30,  31,  33,  33,  34,  37,  37,  39,  42,  42,  45,  49,  49,  48,  48,  48,  49,  49,  49,  50,  50,  51,  52,  52,  53,  54,  54,
+             31,  31,  31,  31,  32,  32,  33,  35,  36,  37,  40,  40,  41,  43,  43,  44,  46,  46,  46,  46,  46,  45,  45,  45,  46,  46,  47,  48,  48,  48,  50,  50,
+             35,  36,  37,  37,  38,  38,  38,  41,  41,  42,  45,  45,  46,  46,  46,  47,  48,  48,  47,  46,  46,  46,  45,  46,  46,  46,  47,  47,  47,  48,  49,  49,
+             38,  39,  40,  40,  40,  41,  41,  43,  44,  45,  47,  47,  47,  48,  48,  48,  49,  49,  48,  48,  48,  47,  47,  47,  48,  48,  48,  48,  48,  49,  50,  50,
+             48,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  48,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  54,  54,  55,  55,
+             49,  48,  47,  47,  45,  45,  45,  45,  46,  45,  45,  45,  47,  49,  49,  51,  53,  53,  55,  56,  56,  57,  58,  58,  59,  59,  60,  61,  61,  61,  62,  62,
+             50,  49,  48,  48,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  52,  54,  54,  55,  56,  56,  58,  59,  60,  61,  61,  61,  63,  63,  63,  65,  65,
+             52,  50,  50,  50,  48,  48,  48,  47,  47,  47,  47,  47,  48,  50,  50,  52,  54,  54,  56,  57,  57,  60,  61,  61,  63,  64,  64,  66,  66,  67,  68,  68,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  34,  34,  34,  35,  36,  36,  38,  39,  39,  41,  44,  44,  44,  47,  48,  48,  51,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  34,  36,  37,  37,  39,  41,  41,  42,  44,  45,  45,  47,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  36,  36,  37,  39,  39,  40,  42,  42,  42,  44,  45,  45,  48,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  38,  38,  38,  40,  40,  40,  41,  43,  43,  43,  45,  46,  46,  48,
+             35,  35,  35,  35,  34,  34,  34,  34,  35,  36,  36,  37,  37,  37,  39,  41,  41,  42,  45,  46,  46,  47,  48,  48,  49,  51,  51,  51,  53,  54,  54,  56,
+             36,  35,  35,  35,  35,  34,  34,  35,  36,  36,  36,  37,  38,  38,  40,  42,  42,  43,  47,  48,  48,  49,  50,  50,  51,  53,  53,  54,  56,  56,  56,  58,
+             44,  43,  42,  42,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  44,  48,  48,  49,  52,  54,  54,  56,  58,  58,  60,  63,  63,  64,  66,  67,  67,  69,
+             47,  46,  45,  45,  45,  44,  44,  44,  44,  45,  45,  45,  45,  45,  47,  50,  50,  51,  55,  56,  56,  58,  60,  60,  62,  66,  66,  67,  69,  70,  70,  73,
+        }, {
+             32,  31,  31,  31,  31,  30,  30,  31,  33,  33,  33,  35,  37,  37,  39,  42,  42,  43,  47,  49,  49,  48,  48,  48,  48,  49,  49,  49,  50,  50,  50,  51,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,  43,  43,  46,  46,  46,  46,  46,  46,  45,  45,  45,  45,  46,  46,  46,  47,
+             33,  33,  34,  34,  34,  35,  35,  35,  37,  38,  38,  41,  43,  43,  43,  44,  44,  45,  46,  47,  47,  46,  46,  46,  46,  45,  45,  45,  46,  46,  46,  47,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  46,  46,  46,  46,  46,  46,  46,  47,
+             45,  45,  45,  45,  45,  44,  44,  45,  46,  46,  46,  47,  47,  47,  48,  49,  49,  50,  51,  52,  52,  52,  51,  51,  51,  51,  51,  52,  52,  52,  52,  52,
+             48,  47,  47,  47,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  50,  50,  50,  52,  53,  53,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,
+             49,  48,  47,  47,  46,  45,  45,  45,  45,  46,  46,  45,  45,  45,  47,  49,  49,  50,  53,  53,  53,  55,  56,  56,  57,  58,  58,  58,  59,  59,  59,  60,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  50,  53,  54,  54,  55,  56,  56,  57,  59,  59,  60,  61,  61,  61,  62,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  36,  36,  36,  37,  39,  39,  39,  41,  44,  44,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  36,  37,  37,  37,  39,  41,  41,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  36,  37,  37,  37,  39,  41,  41,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  49,  50,  50,  50,  52,  53,  53,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  49,  50,  50,  50,  52,  53,  53,
+             44,  43,  42,  42,  42,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  42,  45,  48,  48,  48,  50,  54,  54,  54,  56,  58,  58,  58,  60,  63,  63,
+        }, {
+             32,  31,  31,  31,  31,  31,  30,  30,  30,  32,  33,  33,  33,  35,  37,  37,  37,  39,  42,  42,  42,  45,  49,  49,  49,  48,  48,  48,  48,  48,  49,  49,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  36,  36,  36,  38,  40,  40,  40,  41,  43,  43,  43,  44,  46,  46,  46,  46,  46,  46,  46,  45,  45,  45,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  36,  36,  36,  38,  40,  40,  40,  41,  43,  43,  43,  44,  46,  46,  46,  46,  46,  46,  46,  45,  45,  45,
+             37,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  47,  46,  46,  46,
+             37,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  47,  46,  46,  46,
+             48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  49,  50,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53,
+             48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  49,  50,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  46,  46,  46,  45,  45,  45,  45,  47,  49,  49,  49,  51,  53,  53,  53,  54,  56,  56,  56,  57,  58,  58,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  36,  36,  36,  37,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  36,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  36,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,  35,  36,  36,  36,  36,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,
+             32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,
+             35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  34,  35,  35,  36,  36,  36,  36,  37,  37,  37,  37,  39,  40,  41,  41,  41,  43,  45,  46,  46,  46,  46,
+             36,  35,  35,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  41,  42,  42,  42,  44,  47,  48,  48,  48,  49,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  31,  33,  33,  33,  33,  35,  36,  37,  37,  37,  39,  41,  42,  42,  42,  44,  47,  49,  49,  49,  49,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  33,  34,  35,  35,  35,  37,  38,  39,  39,  39,  40,  42,  42,  42,  42,  44,  46,  47,  47,  47,  47,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,  37,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  46,  46,  46,  46,
+             33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  36,  37,  38,  38,  38,  40,  42,  43,  43,  43,  43,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,
+             37,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  48,  47,
+             37,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  48,  47,
+             45,  45,  45,  45,  45,  45,  45,  44,  44,  44,  44,  45,  46,  46,  46,  46,  46,  47,  47,  47,  47,  48,  49,  49,  49,  49,  50,  51,  52,  52,  52,  52,
+             48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,  53,  53,  53,  53,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  36,  36,  36,  36,  36,  36,  37,  38,  38,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,  31,  32,  33,  33,  33,  33,  33,  34,  35,  36,  37,  37,  37,  37,  39,  40,  42,  42,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  35,  35,  35,  35,  36,  37,  38,  39,  39,  39,  39,  40,  41,  42,  42,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  36,  37,  38,  38,  38,  38,  39,  41,  42,  43,  43,  43,  43,  43,  44,  44,  44,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             38,  39,  39,  40,  40,  40,  40,  40,  40,  40,  41,  41,  41,  41,  41,  42,  43,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,  47,  47,  47,  48,  48,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,  30,  30,  31,  31,  32,  33,  33,  33,  33,  33,  33,  33,  34,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  34,  34,  34,  34,  34,  34,  35,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  38,  38,  38,  38,  38,  39,
+             35,  35,  36,  36,  36,  37,  37,  37,  37,  37,  37,  37,  37,  37,  38,  38,  38,  38,  38,  38,  38,  38,  39,  40,  40,  41,  41,  41,  41,  41,  41,  42,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        },
+    },
+};
+
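+/* Editorial sketch (hypothetical, not part of the upstream tarball): the
+ * tables in this file hold per-position 8-bit quantizer-matrix weights,
+ * one set per qm level, indexed by plane (0 = luma, 1 = chroma).  Under
+ * the AV1 convention a weight of 32 is identity, and a weight w scales a
+ * dequantized coefficient by w/32 with rounding (Round2(x, 5)).  The
+ * helper below illustrates that scaling; the name apply_qm_weight is an
+ * assumption for this sketch, not a dav1d function, so it is fenced off
+ * with #if 0 to leave the imported source's behavior untouched.
+ */
+#if 0 /* illustration only */
+static inline int apply_qm_weight(const int dq_coef, const uint8_t w) {
+    /* Round2(dq_coef * w, 5): weights are in 1/32 units, 32 == identity. */
+    return (dq_coef * w + 16) >> 5;
+}
+#endif
+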
+static const uint8_t qm_tbl_32x16[][2][512] = {
+    {
+        {
+             32,  31,  31,  31,  32,  32,  34,  35,  36,  39,  44,  46,  48,  53,  58,  61,  65,  71,  79,  81,  82,  88,  91,  94,  97, 100, 103, 107, 110, 114, 118, 122,
+             31,  32,  32,  32,  32,  33,  34,  34,  34,  37,  41,  43,  45,  49,  54,  57,  60,  65,  72,  74,  75,  80,  83,  85,  88,  91,  94,  97, 101, 104, 108, 111,
+             32,  32,  33,  33,  34,  35,  37,  37,  38,  40,  43,  44,  46,  50,  54,  56,  58,  63,  70,  71,  72,  77,  80,  83,  86,  89,  93,  96, 100, 104, 107, 111,
+             34,  34,  33,  34,  35,  37,  39,  41,  43,  45,  48,  49,  51,  54,  58,  60,  63,  68,  74,  75,  76,  80,  81,  82,  85,  87,  90,  93,  97, 100, 103, 107,
+             36,  35,  34,  35,  36,  38,  42,  45,  48,  50,  53,  55,  56,  60,  63,  66,  68,  73,  79,  80,  81,  85,  88,  91,  94,  97,  98, 100, 101, 103, 105, 107,
+             44,  42,  41,  41,  42,  42,  48,  50,  54,  58,  63,  65,  67,  71,  75,  77,  79,  84,  90,  91,  92,  97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119,
+             53,  51,  49,  49,  50,  49,  54,  57,  60,  65,  71,  73,  76,  82,  87,  89,  92,  97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
+             59,  56,  54,  54,  54,  53,  58,  61,  64,  69,  75,  78,  80,  87,  92,  95,  98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, 127, 131, 136,
+             65,  62,  59,  59,  59,  58,  63,  65,  68,  73,  79,  82,  85,  92,  98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
+             79,  75,  72,  71,  71,  69,  73,  76,  78,  84,  90,  93,  96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156,
+             87,  82,  78,  78,  77,  75,  79,  82,  84,  89,  95,  98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
+             90,  85,  82,  81,  80,  78,  78,  83,  87,  89,  93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178,
+             93,  88,  86,  84,  82,  82,  80,  84,  86,  91,  94,  98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
+             96,  91,  90,  87,  86,  86,  83,  84,  89,  91,  95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203,
+             99,  94,  93,  90,  89,  89,  88,  87,  90,  93,  97,  99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+            102,  97,  97,  93,  93,  92,  92,  90,  90,  96,  97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+        }, {
+             32,  31,  30,  32,  33,  37,  42,  45,  49,  48,  49,  49,  50,  52,  54,  55,  57,  60,  63,  64,  64,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
+             31,  31,  32,  34,  36,  40,  43,  44,  46,  46,  45,  46,  46,  48,  50,  51,  52,  54,  57,  58,  59,  61,  62,  62,  63,  64,  65,  66,  67,  68,  69,  70,
+             37,  38,  40,  41,  43,  47,  47,  47,  48,  47,  46,  46,  46,  47,  49,  49,  50,  52,  55,  55,  56,  58,  59,  60,  62,  63,  64,  65,  67,  68,  69,  70,
+             42,  42,  42,  44,  45,  47,  48,  49,  50,  50,  49,  49,  50,  50,  52,  52,  53,  55,  58,  58,  58,  60,  60,  60,  60,  61,  62,  63,  64,  65,  66,  67,
+             48,  47,  46,  46,  47,  47,  50,  51,  53,  53,  53,  53,  54,  54,  55,  56,  56,  58,  60,  61,  61,  63,  64,  65,  66,  67,  66,  66,  66,  66,  67,  67,
+             49,  47,  45,  45,  46,  45,  49,  51,  53,  56,  58,  59,  59,  61,  62,  63,  64,  65,  67,  68,  68,  69,  71,  70,  69,  68,  68,  69,  70,  71,  72,  73,
+             52,  50,  48,  48,  47,  47,  50,  52,  54,  57,  61,  62,  64,  66,  68,  69,  70,  72,  75,  75,  75,  76,  74,  72,  73,  74,  75,  75,  74,  74,  73,  73,
+             54,  52,  50,  49,  49,  48,  52,  54,  55,  59,  62,  64,  65,  68,  71,  72,  73,  75,  78,  78,  79,  79,  78,  79,  81,  79,  78,  76,  77,  78,  80,  81,
+             57,  54,  52,  51,  51,  50,  53,  55,  57,  60,  64,  65,  67,  71,  73,  75,  76,  79,  82,  82,  83,  85,  86,  85,  83,  82,  83,  84,  84,  83,  82,  81,
+             63,  60,  57,  57,  56,  54,  57,  59,  60,  64,  67,  69,  71,  75,  78,  80,  82,  85,  89,  89,  90,  92,  91,  88,  89,  90,  89,  87,  86,  87,  88,  90,
+             66,  63,  60,  59,  59,  57,  60,  61,  62,  66,  69,  71,  73,  77,  80,  82,  84,  88,  92,  92,  93,  95,  94,  95,  96,  93,  92,  93,  94,  93,  91,  90,
+             67,  64,  62,  61,  60,  58,  58,  61,  63,  65,  67,  70,  72,  74,  78,  80,  82,  86,  88,  90,  95,  96,  96,  98,  97,  98, 100,  98,  96,  96,  97,  99,
+             68,  65,  63,  62,  60,  60,  59,  61,  62,  65,  66,  68,  72,  73,  76,  79,  80,  84,  87,  89,  93,  94,  98,  99,  99, 102, 101, 102, 103, 103, 101,  99,
+             69,  66,  65,  63,  62,  61,  60,  60,  63,  64,  66,  68,  70,  73,  74,  78,  80,  82,  85,  87,  91,  92,  96,  98, 101, 102, 103, 105, 105, 105, 107, 108,
+             71,  67,  66,  64,  63,  62,  62,  61,  62,  64,  66,  67,  70,  71,  75,  76,  79,  81,  84,  86,  89,  91,  94,  97,  98, 102, 104, 106, 106, 109, 109, 108,
+             72,  68,  68,  65,  65,  63,  63,  61,  62,  65,  65,  68,  69,  72,  73,  77,  77,  81,  81,  86,  87,  91,  91,  96,  97, 101, 102, 107, 107, 109, 110, 113,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  32,  34,  36,  38,  39,  44,  47,  49,  53,  58,  61,  65,  71,  76,  79,  82,  86,  89,  92,  95,  98, 101, 104, 107, 110, 114,
+             31,  32,  32,  32,  32,  33,  33,  34,  34,  36,  37,  41,  44,  46,  49,  54,  56,  60,  65,  69,  72,  75,  78,  81,  84,  86,  89,  92,  95,  98, 101, 104,
+             32,  32,  32,  33,  34,  35,  35,  36,  37,  39,  40,  42,  45,  47,  50,  54,  56,  59,  64,  68,  70,  73,  76,  79,  82,  85,  88,  91,  94,  97, 100, 104,
+             32,  33,  33,  33,  34,  36,  36,  38,  40,  41,  42,  45,  47,  48,  51,  55,  57,  60,  65,  69,  71,  74,  77,  78,  80,  83,  85,  88,  91,  94,  97, 100,
+             36,  35,  35,  35,  36,  38,  40,  42,  48,  49,  50,  53,  56,  57,  60,  63,  65,  68,  73,  76,  79,  81,  84,  87,  89,  92,  93,  94,  95,  96,  98, 100,
+             44,  42,  41,  41,  42,  42,  44,  48,  54,  56,  58,  63,  66,  67,  71,  75,  77,  79,  84,  88,  90,  92,  95,  95,  95,  95,  95,  98, 101, 105, 108, 111,
+             47,  45,  44,  44,  45,  45,  47,  50,  56,  58,  60,  66,  69,  71,  75,  79,  81,  84,  89,  92,  95,  97, 100,  99, 101, 105, 108, 110, 110, 110, 111, 111,
+             53,  51,  49,  49,  50,  49,  51,  54,  60,  63,  65,  71,  75,  77,  82,  87,  89,  92,  97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126,
+             65,  62,  60,  59,  59,  58,  60,  63,  68,  71,  73,  79,  84,  86,  92,  98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
+             73,  69,  67,  66,  65,  64,  66,  69,  74,  77,  79,  85,  90,  93,  99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145,
+             79,  75,  72,  71,  71,  69,  71,  73,  78,  81,  84,  90,  95,  97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
+             87,  83,  80,  79,  78,  76,  76,  80,  84,  86,  90,  96,  99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+             90,  86,  84,  82,  80,  80,  78,  82,  83,  88,  91,  94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
+             93,  88,  87,  84,  83,  83,  81,  81,  86,  88,  92,  96,  98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189,
+             96,  91,  90,  87,  87,  86,  85,  84,  87,  90,  94,  96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
+             99,  94,  94,  90,  90,  88,  89,  86,  87,  93,  93,  99,  99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201,
+        }, {
+             32,  31,  30,  31,  33,  37,  39,  42,  49,  48,  48,  49,  50,  51,  52,  54,  55,  57,  60,  62,  63,  64,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
+             31,  31,  32,  33,  36,  40,  41,  43,  46,  46,  46,  45,  46,  47,  48,  50,  51,  52,  54,  56,  57,  59,  60,  61,  62,  63,  64,  65,  65,  66,  67,  68,
+             35,  37,  38,  38,  41,  45,  46,  46,  48,  47,  46,  45,  46,  47,  47,  49,  49,  50,  53,  54,  55,  56,  58,  59,  60,  61,  62,  64,  65,  66,  67,  68,
+             38,  40,  40,  41,  44,  47,  47,  48,  49,  48,  48,  47,  48,  48,  48,  50,  50,  51,  53,  55,  56,  57,  58,  58,  59,  60,  60,  61,  62,  63,  64,  65,
+             48,  47,  46,  46,  47,  47,  48,  50,  53,  53,  53,  53,  54,  54,  54,  55,  56,  56,  58,  60,  60,  61,  62,  63,  64,  65,  65,  65,  65,  65,  65,  65,
+             49,  47,  45,  45,  46,  45,  47,  49,  53,  55,  56,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  68,  67,  66,  66,  67,  68,  69,  70,  71,
+             50,  48,  46,  46,  46,  46,  47,  50,  54,  55,  56,  59,  61,  61,  63,  65,  65,  66,  68,  69,  70,  71,  72,  71,  71,  72,  73,  73,  72,  72,  71,  71,
+             52,  50,  48,  48,  47,  47,  48,  50,  54,  56,  57,  61,  63,  64,  66,  68,  69,  70,  72,  74,  75,  75,  76,  78,  79,  77,  76,  74,  75,  76,  77,  78,
+             57,  54,  52,  52,  51,  50,  51,  53,  57,  58,  60,  64,  66,  68,  71,  73,  74,  76,  79,  81,  82,  83,  84,  83,  81,  80,  81,  82,  82,  81,  79,  78,
+             61,  57,  55,  55,  54,  52,  54,  56,  59,  61,  62,  66,  68,  70,  73,  76,  77,  79,  82,  84,  86,  87,  88,  86,  86,  88,  87,  85,  83,  85,  86,  87,
+             63,  60,  58,  57,  56,  54,  55,  57,  60,  62,  64,  67,  70,  71,  75,  78,  79,  82,  85,  87,  89,  90,  91,  93,  94,  91,  89,  90,  92,  90,  89,  87,
+             67,  63,  61,  60,  59,  57,  57,  60,  63,  64,  66,  69,  71,  73,  77,  79,  81,  85,  87,  88,  92,  93,  94,  96,  95,  96,  97,  95,  93,  93,  94,  96,
+             68,  64,  63,  61,  60,  59,  58,  60,  61,  64,  65,  67,  71,  72,  75,  78,  79,  83,  85,  87,  91,  92,  95,  96,  97,  99,  98,  99, 100, 100,  98,  96,
+             69,  65,  64,  62,  61,  61,  59,  59,  62,  63,  65,  67,  68,  72,  73,  76,  78,  81,  84,  85,  89,  90,  93,  96,  98,  99, 100, 102, 102, 102, 103, 105,
+             70,  66,  65,  63,  63,  62,  61,  60,  61,  63,  65,  66,  69,  70,  74,  74,  78,  79,  82,  84,  87,  89,  91,  94,  96, 100, 101, 103, 103, 105, 105, 105,
+             71,  67,  67,  64,  64,  62,  62,  60,  61,  64,  64,  67,  67,  71,  71,  75,  75,  79,  80,  84,  84,  89,  89,  94,  94,  98,  99, 104, 104, 106, 106, 109,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  32,  32,  32,  34,  34,  36,  39,  40,  44,  47,  49,  53,  57,  59,  65,  69,  71,  79,  81,  82,  87,  90,  92,  95,  98, 100, 103, 106,
+             31,  32,  32,  32,  32,  32,  33,  34,  34,  34,  37,  38,  41,  44,  46,  49,  53,  54,  60,  63,  65,  72,  74,  75,  79,  82,  84,  87,  89,  92,  94,  97,
+             32,  32,  32,  32,  33,  34,  34,  35,  36,  37,  39,  40,  42,  45,  46,  50,  53,  54,  59,  62,  64,  71,  72,  73,  77,  80,  83,  85,  88,  91,  94,  97,
+             32,  32,  32,  33,  34,  34,  35,  37,  37,  38,  40,  41,  43,  46,  47,  50,  53,  54,  58,  62,  63,  70,  71,  72,  76,  78,  81,  83,  85,  88,  90,  93,
+             36,  35,  35,  34,  36,  37,  38,  42,  44,  48,  50,  51,  53,  56,  57,  60,  63,  64,  68,  71,  73,  79,  80,  81,  85,  87,  88,  88,  89,  90,  92,  93,
+             39,  38,  38,  37,  39,  40,  40,  45,  47,  51,  54,  55,  58,  61,  62,  65,  68,  69,  73,  76,  78,  84,  85,  86,  90,  89,  90,  92,  95,  98, 101, 104,
+             44,  42,  41,  41,  42,  42,  42,  48,  50,  54,  58,  59,  63,  66,  67,  71,  74,  75,  79,  83,  84,  90,  91,  92,  96,  99, 102, 103, 103, 103, 103, 104,
+             53,  51,  50,  49,  50,  49,  49,  54,  56,  60,  65,  67,  71,  75,  77,  82,  86,  87,  92,  96,  97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118,
+             58,  55,  54,  53,  53,  53,  52,  57,  59,  63,  68,  70,  74,  79,  81,  86,  90,  91,  97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
+             65,  62,  60,  59,  59,  58,  58,  63,  65,  68,  73,  75,  79,  85,  86,  92,  97,  98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135,
+             79,  75,  73,  72,  71,  70,  69,  73,  75,  78,  84,  85,  90,  95,  97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
+             81,  77,  75,  74,  72,  71,  70,  75,  77,  80,  85,  87,  91,  97,  99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154,
+             88,  83,  81,  79,  78,  77,  76,  79,  81,  85,  88,  91,  97,  99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
+             90,  86,  84,  82,  81,  80,  78,  79,  83,  85,  89,  92,  94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175,
+             93,  88,  88,  84,  84,  83,  82,  81,  84,  86,  90,  92,  97,  98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
+             96,  91,  91,  87,  87,  85,  86,  83,  84,  89,  89,  95,  95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+        }, {
+             32,  31,  31,  30,  33,  35,  37,  42,  44,  49,  48,  48,  49,  50,  51,  52,  54,  54,  57,  59,  60,  63,  64,  64,  66,  67,  68,  69,  70,  71,  72,  73,
+             31,  31,  32,  32,  36,  38,  40,  43,  44,  46,  46,  45,  45,  46,  47,  48,  49,  50,  52,  54,  54,  57,  58,  59,  60,  61,  62,  63,  64,  65,  65,  66,
+             34,  35,  36,  36,  40,  42,  44,  45,  46,  47,  46,  46,  45,  46,  47,  47,  49,  49,  51,  52,  53,  56,  57,  57,  59,  60,  61,  62,  63,  64,  65,  66,
+             37,  38,  39,  40,  43,  45,  47,  47,  47,  48,  47,  46,  46,  46,  47,  47,  48,  49,  50,  52,  52,  55,  55,  56,  57,  58,  59,  60,  60,  61,  62,  63,
+             48,  47,  46,  46,  47,  47,  47,  50,  51,  53,  53,  53,  53,  54,  54,  54,  55,  55,  56,  58,  58,  60,  61,  61,  63,  63,  63,  63,  63,  63,  63,  63,
+             48,  47,  46,  45,  46,  46,  46,  50,  51,  53,  54,  55,  56,  56,  57,  57,  58,  59,  60,  61,  62,  64,  64,  65,  66,  65,  64,  65,  66,  67,  68,  69,
+             49,  47,  46,  45,  46,  45,  45,  49,  51,  53,  56,  56,  58,  59,  60,  61,  62,  62,  64,  65,  65,  67,  68,  68,  69,  70,  71,  71,  70,  70,  69,  69,
+             52,  50,  48,  48,  47,  47,  47,  50,  52,  54,  57,  58,  61,  63,  64,  66,  68,  68,  70,  72,  72,  75,  75,  75,  77,  75,  74,  72,  73,  74,  75,  76,
+             54,  51,  50,  49,  49,  48,  48,  51,  53,  55,  58,  59,  62,  65,  65,  68,  70,  70,  73,  74,  75,  77,  78,  78,  79,  78,  79,  80,  80,  78,  77,  76,
+             57,  54,  53,  52,  51,  50,  50,  53,  54,  57,  60,  61,  64,  66,  68,  71,  73,  74,  76,  78,  79,  82,  82,  83,  84,  85,  84,  82,  81,  82,  83,  84,
+             63,  60,  58,  57,  56,  55,  54,  57,  59,  60,  64,  65,  67,  70,  71,  75,  77,  78,  82,  84,  85,  89,  89,  90,  91,  88,  87,  88,  89,  88,  86,  84,
+             64,  61,  59,  58,  57,  56,  55,  58,  59,  61,  64,  65,  68,  71,  72,  75,  78,  79,  82,  85,  86,  90,  90,  91,  93,  93,  94,  93,  90,  90,  92,  93,
+             67,  63,  62,  60,  59,  58,  57,  59,  60,  63,  64,  66,  70,  70,  73,  76,  77,  81,  83,  85,  89,  90,  93,  94,  94,  96,  96,  96,  97,  97,  95,  93,
+             68,  64,  63,  61,  60,  60,  58,  58,  61,  62,  64,  66,  67,  71,  71,  75,  77,  79,  82,  83,  87,  88,  91,  93,  95,  97,  97,  99,  99,  99, 100, 101,
+             69,  65,  65,  62,  62,  61,  60,  59,  61,  62,  64,  65,  68,  68,  72,  73,  76,  77,  81,  82,  85,  87,  89,  92,  93,  97,  98, 100, 100, 102, 102, 101,
+             69,  66,  66,  63,  63,  61,  61,  59,  60,  63,  63,  66,  66,  70,  70,  73,  74,  78,  78,  82,  82,  86,  87,  91,  91,  95,  96, 101, 101, 103, 103, 105,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  34,  35,  36,  39,  41,  44,  47,  48,  53,  55,  58,  63,  65,  71,  74,  79,  82,  82,  87,  89,  92,  94,  97,  99,
+             31,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  37,  39,  41,  44,  45,  49,  51,  54,  58,  60,  65,  68,  72,  75,  75,  79,  82,  84,  86,  88,  91,
+             31,  32,  32,  32,  33,  33,  34,  34,  35,  36,  36,  39,  40,  42,  44,  45,  50,  51,  54,  58,  59,  64,  67,  71,  73,  74,  78,  81,  83,  85,  88,  91,
+             32,  32,  32,  33,  34,  34,  35,  36,  37,  38,  38,  40,  41,  43,  45,  46,  50,  51,  54,  57,  58,  63,  66,  70,  72,  72,  76,  78,  80,  82,  85,  87,
+             35,  35,  34,  34,  35,  36,  37,  39,  41,  45,  46,  48,  49,  51,  53,  54,  57,  59,  61,  65,  66,  71,  73,  77,  79,  79,  83,  83,  84,  85,  86,  87,
+             36,  35,  35,  34,  36,  36,  38,  40,  42,  47,  48,  50,  51,  53,  56,  56,  60,  61,  63,  67,  68,  73,  75,  79,  81,  81,  85,  87,  89,  92,  94,  97,
+             44,  42,  41,  41,  42,  42,  42,  44,  48,  52,  54,  58,  60,  63,  66,  67,  71,  72,  75,  78,  79,  84,  86,  90,  92,  92,  96,  97,  97,  97,  97,  97,
+             47,  45,  45,  44,  44,  45,  45,  47,  50,  55,  56,  60,  62,  66,  69,  70,  75,  77,  79,  83,  84,  89,  91,  95,  97,  97, 100,  99, 101, 104, 107, 110,
+             53,  51,  50,  49,  49,  50,  49,  51,  54,  59,  60,  65,  67,  71,  75,  76,  82,  84,  87,  91,  92,  97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
+             62,  59,  58,  57,  57,  57,  56,  58,  61,  65,  66,  71,  74,  78,  82,  83,  90,  92,  95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126,
+             65,  62,  61,  59,  59,  59,  58,  60,  63,  67,  68,  73,  76,  79,  84,  85,  92,  94,  98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+             79,  75,  74,  72,  71,  71,  69,  71,  73,  77,  78,  84,  86,  90,  95,  96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144,
+             82,  78,  76,  74,  73,  73,  71,  73,  76,  79,  80,  86,  88,  92,  97,  98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
+             88,  83,  82,  79,  79,  78,  76,  76,  81,  82,  85,  89,  91,  97,  98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163,
+             90,  86,  85,  82,  81,  80,  79,  78,  81,  83,  87,  88,  93,  94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
+             93,  88,  88,  84,  84,  82,  83,  80,  80,  86,  86,  91,  91,  97,  98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173,
+        }, {
+             32,  31,  31,  30,  33,  33,  37,  39,  42,  47,  49,  48,  48,  49,  50,  50,  52,  53,  54,  56,  57,  60,  61,  63,  64,  64,  66,  67,  68,  69,  70,  70,
+             31,  31,  32,  32,  35,  36,  40,  41,  43,  46,  46,  46,  45,  45,  46,  46,  48,  49,  50,  51,  52,  54,  56,  57,  58,  59,  60,  61,  62,  63,  63,  64,
+             33,  34,  34,  35,  37,  38,  43,  43,  44,  46,  47,  46,  46,  45,  46,  46,  47,  48,  49,  51,  51,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
+             37,  38,  39,  40,  42,  43,  47,  47,  47,  48,  48,  47,  46,  46,  46,  46,  47,  48,  49,  50,  50,  52,  53,  55,  56,  56,  57,  58,  59,  59,  60,  61,
+             45,  45,  45,  44,  46,  46,  47,  48,  49,  51,  52,  51,  51,  51,  52,  52,  53,  53,  54,  55,  55,  57,  58,  59,  60,  60,  61,  61,  61,  61,  61,  61,
+             48,  47,  46,  46,  47,  47,  47,  48,  50,  52,  53,  53,  53,  53,  54,  54,  54,  55,  55,  56,  56,  58,  59,  60,  61,  61,  63,  63,  64,  65,  66,  67,
+             49,  47,  46,  45,  45,  46,  45,  47,  49,  53,  53,  56,  57,  58,  59,  59,  61,  61,  62,  63,  64,  65,  66,  67,  68,  68,  69,  69,  68,  68,  67,  67,
+             50,  48,  47,  46,  46,  46,  46,  47,  50,  53,  54,  56,  57,  59,  61,  61,  63,  64,  65,  66,  66,  68,  69,  70,  71,  71,  72,  70,  71,  72,  73,  74,
+             52,  50,  49,  48,  47,  47,  47,  48,  50,  53,  54,  57,  59,  61,  63,  64,  66,  67,  68,  70,  70,  72,  73,  75,  75,  75,  77,  78,  77,  76,  75,  74,
+             56,  53,  52,  51,  50,  50,  49,  50,  53,  55,  56,  59,  61,  63,  65,  66,  70,  71,  72,  74,  75,  77,  79,  80,  81,  81,  82,  80,  79,  80,  81,  82,
+             57,  54,  53,  52,  51,  51,  50,  51,  53,  56,  57,  60,  61,  64,  66,  67,  71,  72,  73,  76,  76,  79,  80,  82,  83,  83,  84,  85,  86,  85,  83,  82,
+             63,  60,  59,  57,  56,  56,  54,  55,  57,  60,  60,  64,  65,  67,  70,  71,  75,  76,  78,  81,  82,  85,  86,  89,  90,  90,  92,  90,  88,  88,  89,  90,
+             64,  61,  60,  58,  57,  57,  55,  56,  58,  61,  61,  64,  66,  68,  71,  71,  75,  77,  79,  82,  83,  86,  87,  90,  91,  91,  93,  93,  94,  94,  92,  90,
+             67,  63,  62,  60,  60,  59,  57,  57,  60,  61,  63,  65,  66,  70,  70,  73,  75,  77,  80,  81,  85,  86,  89,  91,  93,  94,  94,  96,  96,  95,  97,  98,
+             68,  64,  64,  61,  61,  60,  59,  58,  60,  61,  63,  64,  67,  67,  71,  71,  74,  75,  79,  80,  83,  85,  87,  89,  91,  94,  95,  97,  97,  99,  98,  98,
+             68,  65,  65,  62,  62,  60,  61,  59,  59,  62,  62,  65,  65,  68,  68,  72,  72,  76,  76,  80,  80,  84,  84,  89,  89,  93,  93,  97,  98,  99,  99, 102,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  33,  34,  36,  36,  39,  40,  44,  46,  48,  52,  53,  58,  58,  65,  66,  71,  74,  79,  81,  82,  86,  88,  91,  93,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  37,  38,  41,  43,  45,  48,  49,  53,  54,  60,  61,  65,  68,  72,  74,  75,  78,  81,  83,  85,
+             31,  32,  32,  32,  32,  33,  33,  33,  34,  34,  35,  35,  38,  39,  41,  43,  45,  48,  49,  53,  54,  59,  60,  65,  67,  72,  73,  74,  78,  80,  82,  85,
+             32,  32,  32,  33,  33,  34,  35,  35,  36,  37,  38,  38,  40,  41,  43,  44,  46,  49,  50,  53,  54,  58,  59,  63,  66,  70,  71,  72,  75,  77,  79,  81,
+             33,  33,  33,  33,  34,  35,  36,  36,  38,  39,  42,  42,  44,  45,  46,  48,  50,  52,  53,  57,  57,  62,  63,  67,  69,  73,  75,  75,  78,  80,  80,  81,
+             36,  35,  35,  34,  35,  36,  37,  38,  41,  42,  48,  48,  50,  51,  53,  55,  56,  59,  60,  63,  63,  68,  69,  73,  75,  79,  80,  81,  84,  86,  88,  90,
+             40,  39,  39,  38,  38,  39,  40,  41,  44,  45,  51,  51,  54,  56,  59,  60,  62,  65,  66,  69,  70,  74,  75,  79,  81,  85,  86,  87,  90,  90,  90,  90,
+             44,  42,  42,  41,  41,  42,  42,  42,  46,  48,  54,  54,  58,  59,  63,  65,  67,  70,  71,  74,  75,  79,  80,  84,  86,  90,  91,  92,  95,  98, 100, 102,
+             51,  49,  49,  47,  47,  48,  48,  48,  52,  53,  58,  59,  63,  65,  69,  72,  74,  78,  79,  83,  84,  89,  90,  94,  97, 101, 102, 103, 106, 105, 103, 103,
+             53,  51,  51,  49,  49,  50,  49,  49,  53,  54,  60,  60,  65,  67,  71,  73,  76,  80,  82,  86,  87,  92,  93,  97, 100, 104, 105, 106, 109, 112, 114, 117,
+             65,  62,  61,  59,  59,  59,  58,  58,  62,  63,  68,  68,  73,  75,  79,  82,  85,  90,  92,  97,  98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+             66,  63,  62,  60,  60,  60,  59,  59,  63,  64,  69,  69,  74,  76,  80,  83,  86,  91,  93,  98,  99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134,
+             79,  75,  74,  72,  71,  71,  69,  69,  72,  73,  78,  79,  84,  85,  90,  93,  96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+             81,  77,  76,  74,  73,  72,  71,  70,  74,  75,  80,  80,  85,  87,  91,  94,  98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+             87,  83,  82,  79,  79,  78,  77,  75,  78,  80,  84,  85,  89,  90,  96,  97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
+             90,  85,  85,  81,  81,  80,  80,  77,  78,  83,  83,  87,  88,  93,  93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160,
+        }, {
+             32,  31,  31,  30,  31,  33,  35,  37,  41,  42,  49,  49,  48,  48,  49,  49,  50,  51,  52,  54,  54,  57,  57,  60,  61,  63,  64,  64,  66,  67,  68,  68,
+             31,  31,  31,  32,  33,  36,  38,  40,  42,  43,  46,  46,  46,  45,  45,  46,  46,  47,  48,  50,  50,  52,  52,  54,  56,  57,  58,  59,  60,  61,  62,  62,
+             32,  33,  33,  33,  35,  37,  39,  41,  43,  43,  47,  47,  46,  46,  45,  46,  46,  47,  48,  49,  50,  52,  52,  54,  55,  57,  58,  58,  59,  60,  61,  62,
+             37,  38,  38,  40,  41,  43,  45,  47,  47,  47,  48,  48,  47,  46,  46,  46,  46,  47,  47,  48,  49,  50,  51,  52,  53,  55,  55,  56,  57,  58,  58,  59,
+             40,  41,  41,  42,  43,  44,  46,  47,  48,  48,  50,  50,  49,  49,  48,  49,  49,  49,  50,  51,  51,  52,  53,  55,  56,  57,  58,  58,  59,  59,  59,  59,
+             48,  47,  47,  46,  46,  47,  47,  47,  49,  50,  53,  53,  53,  53,  53,  53,  54,  54,  54,  55,  55,  56,  57,  58,  59,  60,  61,  61,  62,  63,  64,  65,
+             49,  47,  47,  45,  46,  46,  46,  46,  49,  49,  53,  53,  54,  55,  56,  57,  57,  58,  58,  59,  59,  60,  61,  62,  63,  64,  65,  65,  66,  66,  65,  65,
+             49,  47,  47,  45,  45,  46,  45,  45,  48,  49,  53,  54,  56,  56,  58,  59,  59,  61,  61,  62,  62,  64,  64,  65,  66,  67,  68,  68,  69,  70,  71,  71,
+             51,  49,  49,  47,  47,  47,  47,  46,  49,  50,  54,  54,  57,  58,  61,  62,  63,  64,  65,  67,  67,  69,  69,  71,  72,  73,  73,  74,  75,  74,  72,  71,
+             52,  50,  49,  48,  48,  47,  47,  47,  50,  50,  54,  55,  57,  58,  61,  62,  64,  66,  66,  68,  68,  70,  71,  72,  73,  75,  75,  75,  76,  77,  78,  79,
+             57,  54,  54,  52,  51,  51,  50,  50,  52,  53,  57,  57,  60,  61,  64,  65,  67,  69,  71,  73,  73,  76,  77,  79,  80,  82,  82,  83,  84,  82,  81,  79,
+             58,  55,  54,  52,  52,  52,  51,  50,  53,  54,  57,  57,  60,  61,  64,  66,  67,  70,  71,  73,  74,  77,  77,  79,  81,  82,  83,  83,  85,  85,  86,  87,
+             63,  60,  59,  57,  57,  56,  55,  54,  57,  57,  60,  61,  64,  65,  67,  69,  71,  73,  75,  77,  78,  82,  82,  85,  86,  89,  89,  90,  91,  91,  89,  87,
+             64,  61,  60,  58,  57,  57,  56,  55,  57,  58,  61,  61,  64,  65,  68,  69,  71,  74,  75,  78,  78,  82,  83,  86,  87,  90,  90,  91,  92,  93,  94,  95,
+             67,  63,  63,  60,  60,  59,  58,  57,  59,  60,  62,  63,  65,  66,  69,  70,  73,  74,  77,  78,  81,  83,  85,  87,  88,  92,  92,  94,  94,  96,  95,  95,
+             67,  64,  64,  61,  61,  60,  60,  58,  58,  61,  61,  64,  64,  67,  67,  70,  71,  74,  74,  78,  78,  82,  82,  86,  86,  90,  90,  95,  95,  96,  96,  98,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  32,  32,  32,  32,  34,  34,  36,  36,  39,  39,  44,  44,  48,  48,  53,  53,  58,  58,  65,  65,  71,  71,  79,  79,  82,  82,  87,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  37,  37,  41,  41,  45,  45,  49,  49,  54,  54,  60,  60,  65,  65,  72,  72,  75,  75,  79,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  37,  37,  41,  41,  45,  45,  49,  49,  54,  54,  60,  60,  65,  65,  72,  72,  75,  75,  79,
+             32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  43,  43,  46,  46,  50,  50,  54,  54,  58,  58,  63,  63,  70,  70,  72,  72,  76,
+             32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  43,  43,  46,  46,  50,  50,  54,  54,  58,  58,  63,  63,  70,  70,  72,  72,  76,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,  50,  50,  53,  53,  56,  56,  60,  60,  63,  63,  68,  68,  73,  73,  79,  79,  81,  81,  84,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,  50,  50,  53,  53,  56,  56,  60,  60,  63,  63,  68,  68,  73,  73,  79,  79,  81,  81,  84,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  48,  48,  54,  54,  58,  58,  63,  63,  67,  67,  71,  71,  75,  75,  79,  79,  84,  84,  90,  90,  92,  92,  96,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  48,  48,  54,  54,  58,  58,  63,  63,  67,  67,  71,  71,  75,  75,  79,  79,  84,  84,  90,  90,  92,  92,  96,
+             53,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,  82,  87,  87,  92,  92,  97,  97, 104, 104, 106, 106, 109,
+             53,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,  82,  87,  87,  92,  92,  97,  97, 104, 104, 106, 106, 109,
+             65,  62,  62,  59,  59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,  98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+             65,  62,  62,  59,  59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,  98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+             79,  75,  75,  72,  72,  71,  71,  69,  69,  73,  73,  78,  78,  84,  84,  90,  90,  96,  96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+             79,  75,  75,  72,  72,  71,  71,  69,  69,  73,  73,  78,  78,  84,  84,  90,  90,  96,  96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+             87,  82,  82,  78,  78,  77,  77,  75,  75,  79,  79,  84,  84,  89,  89,  95,  95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+        }, {
+             32,  31,  31,  30,  30,  33,  33,  37,  37,  42,  42,  49,  49,  48,  48,  49,  49,  50,  50,  52,  52,  54,  54,  57,  57,  60,  60,  63,  63,  64,  64,  66,
+             31,  31,  31,  32,  32,  36,  36,  40,  40,  43,  43,  46,  46,  46,  46,  45,  45,  46,  46,  48,  48,  50,  50,  52,  52,  54,  54,  57,  57,  59,  59,  60,
+             31,  31,  31,  32,  32,  36,  36,  40,  40,  43,  43,  46,  46,  46,  46,  45,  45,  46,  46,  48,  48,  50,  50,  52,  52,  54,  54,  57,  57,  59,  59,  60,
+             37,  38,  38,  40,  40,  43,  43,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  47,  47,  49,  49,  50,  50,  52,  52,  55,  55,  56,  56,  57,
+             37,  38,  38,  40,  40,  43,  43,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  47,  47,  49,  49,  50,  50,  52,  52,  55,  55,  56,  56,  57,
+             48,  47,  47,  46,  46,  47,  47,  47,  47,  50,  50,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  56,  56,  58,  58,  60,  60,  61,  61,  63,
+             48,  47,  47,  46,  46,  47,  47,  47,  47,  50,  50,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  56,  56,  58,  58,  60,  60,  61,  61,  63,
+             49,  47,  47,  45,  45,  46,  46,  45,  45,  49,  49,  53,  53,  56,  56,  58,  58,  59,  59,  61,  61,  62,  62,  64,  64,  65,  65,  67,  67,  68,  68,  69,
+             49,  47,  47,  45,  45,  46,  46,  45,  45,  49,  49,  53,  53,  56,  56,  58,  58,  59,  59,  61,  61,  62,  62,  64,  64,  65,  65,  67,  67,  68,  68,  69,
+             52,  50,  50,  48,  48,  47,  47,  47,  47,  50,  50,  54,  54,  57,  57,  61,  61,  64,  64,  66,  66,  68,  68,  70,  70,  72,  72,  75,  75,  75,  75,  77,
+             52,  50,  50,  48,  48,  47,  47,  47,  47,  50,  50,  54,  54,  57,  57,  61,  61,  64,  64,  66,  66,  68,  68,  70,  70,  72,  72,  75,  75,  75,  75,  77,
+             57,  54,  54,  52,  52,  51,  51,  50,  50,  53,  53,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  73,  73,  76,  76,  79,  79,  82,  82,  83,  83,  84,
+             57,  54,  54,  52,  52,  51,  51,  50,  50,  53,  53,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  73,  73,  76,  76,  79,  79,  82,  82,  83,  83,  84,
+             63,  60,  60,  57,  57,  56,  56,  54,  54,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  75,  75,  78,  78,  82,  82,  85,  85,  89,  89,  90,  90,  92,
+             63,  60,  60,  57,  57,  56,  56,  54,  54,  57,  57,  60,  60,  64,  64,  67,  67,  71,  71,  75,  75,  78,  78,  82,  82,  85,  85,  89,  89,  90,  90,  92,
+             66,  63,  63,  60,  60,  59,  59,  57,  57,  60,  60,  62,  62,  66,  66,  69,  69,  73,  73,  77,  77,  80,  80,  84,  84,  88,  88,  92,  92,  93,  93,  95,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  34,  35,  36,  38,  39,  41,  44,  44,  48,  48,  53,  53,  57,  58,  61,  65,  67,  71,  72,  79,  79,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  36,  38,  39,  41,  42,  45,  45,  49,  50,  53,  54,  57,  60,  62,  66,  66,  73,  73,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  36,  37,  39,  41,  42,  45,  45,  49,  49,  52,  54,  57,  60,  61,  65,  66,  72,  72,
+             32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  36,  36,  37,  37,  38,  40,  41,  42,  43,  46,  46,  49,  50,  52,  54,  56,  59,  60,  64,  64,  71,  71,
+             32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  37,  37,  38,  38,  40,  40,  41,  43,  43,  46,  46,  49,  50,  52,  54,  56,  58,  60,  63,  64,  70,  70,
+             34,  34,  34,  33,  33,  34,  35,  35,  37,  37,  39,  39,  42,  43,  44,  45,  46,  48,  48,  51,  51,  54,  54,  57,  58,  60,  63,  64,  68,  68,  74,  74,
+             36,  35,  35,  35,  34,  35,  36,  37,  38,  39,  42,  42,  47,  48,  49,  50,  51,  53,  54,  56,  56,  59,  60,  62,  63,  66,  68,  69,  73,  73,  79,  79,
+             38,  37,  37,  36,  36,  37,  38,  38,  39,  40,  44,  44,  48,  49,  51,  52,  54,  56,  56,  59,  59,  62,  63,  65,  67,  69,  71,  72,  76,  76,  82,  82,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  43,  48,  48,  52,  54,  56,  58,  60,  63,  64,  67,  67,  71,  71,  74,  75,  77,  79,  81,  84,  85,  90,  90,
+             44,  43,  43,  42,  41,  42,  43,  43,  43,  44,  48,  48,  53,  54,  57,  58,  60,  64,  64,  67,  67,  71,  72,  75,  76,  78,  80,  82,  85,  86,  91,  91,
+             53,  51,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,  60,  63,  65,  67,  71,  72,  76,  76,  81,  82,  85,  87,  89,  92,  94,  97,  98, 104, 104,
+             53,  51,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,  60,  63,  65,  67,  71,  72,  76,  76,  81,  82,  85,  87,  89,  92,  94,  97,  98, 104, 104,
+             62,  60,  59,  58,  57,  57,  57,  56,  56,  56,  61,  61,  65,  66,  69,  71,  74,  78,  79,  83,  83,  89,  90,  94,  95,  98, 102, 103, 108, 108, 115, 115,
+             65,  62,  62,  60,  59,  59,  59,  59,  58,  58,  63,  63,  67,  68,  71,  73,  76,  79,  81,  85,  85,  91,  92,  96,  98, 101, 105, 106, 111, 111, 118, 118,
+             73,  70,  69,  67,  66,  66,  65,  65,  64,  64,  69,  69,  73,  74,  77,  79,  81,  85,  86,  91,  91,  98,  99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
+             79,  75,  75,  73,  72,  71,  71,  70,  69,  69,  73,  73,  77,  78,  81,  84,  86,  90,  91,  96,  96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133,
+        }, {
+             32,  31,  31,  30,  30,  32,  33,  34,  37,  37,  42,  42,  47,  49,  48,  48,  48,  49,  49,  50,  50,  52,  52,  53,  54,  55,  57,  58,  60,  60,  63,  63,
+             31,  31,  31,  32,  32,  33,  35,  37,  40,  40,  43,  43,  46,  47,  46,  46,  46,  45,  46,  47,  47,  48,  48,  50,  50,  51,  52,  53,  55,  55,  58,  58,
+             31,  31,  31,  32,  32,  34,  36,  37,  40,  40,  43,  43,  46,  46,  46,  46,  45,  45,  45,  46,  46,  48,  48,  49,  50,  51,  52,  53,  54,  55,  57,  57,
+             35,  36,  36,  37,  37,  39,  40,  42,  45,  45,  46,  46,  47,  47,  47,  46,  46,  45,  46,  46,  46,  47,  47,  48,  49,  50,  51,  51,  53,  53,  56,  56,
+             37,  38,  38,  39,  40,  41,  43,  44,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  46,  47,  47,  48,  49,  49,  50,  51,  52,  53,  55,  55,
+             42,  42,  42,  42,  42,  44,  45,  45,  47,  47,  48,  48,  50,  50,  50,  50,  49,  49,  49,  50,  50,  50,  50,  51,  52,  52,  53,  54,  55,  55,  58,  58,
+             48,  47,  47,  46,  46,  46,  47,  47,  47,  48,  50,  50,  52,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  56,  56,  57,  58,  59,  60,  60,
+             48,  47,  47,  46,  46,  46,  46,  47,  47,  47,  50,  50,  52,  53,  53,  54,  54,  55,  55,  55,  55,  56,  56,  57,  57,  58,  58,  59,  60,  60,  62,  62,
+             49,  47,  47,  46,  45,  45,  46,  45,  45,  46,  49,  49,  53,  53,  55,  56,  57,  58,  58,  59,  59,  61,  61,  62,  62,  63,  64,  64,  65,  65,  67,  67,
+             49,  47,  47,  46,  45,  45,  46,  46,  46,  46,  49,  49,  53,  54,  55,  56,  57,  59,  59,  60,  60,  61,  61,  62,  63,  63,  64,  65,  66,  66,  68,  68,
+             52,  50,  50,  48,  48,  48,  47,  47,  47,  47,  50,  50,  53,  54,  56,  57,  59,  61,  62,  64,  64,  66,  66,  68,  68,  69,  70,  71,  72,  73,  75,  75,
+             52,  50,  50,  48,  48,  48,  47,  47,  47,  47,  50,  50,  53,  54,  56,  57,  59,  61,  62,  64,  64,  66,  66,  68,  68,  69,  70,  71,  72,  73,  75,  75,
+             56,  54,  53,  52,  51,  51,  50,  50,  49,  49,  53,  53,  55,  56,  58,  59,  61,  63,  64,  66,  66,  69,  70,  71,  72,  74,  75,  76,  77,  78,  80,  80,
+             57,  54,  54,  52,  52,  51,  51,  51,  50,  50,  53,  53,  56,  57,  58,  60,  61,  64,  64,  67,  67,  70,  71,  72,  73,  75,  76,  77,  79,  79,  82,  82,
+             61,  58,  57,  56,  55,  54,  54,  53,  52,  53,  56,  56,  58,  59,  61,  62,  63,  66,  66,  69,  69,  72,  73,  75,  76,  78,  79,  80,  82,  83,  86,  86,
+             63,  60,  60,  58,  57,  57,  56,  55,  54,  55,  57,  57,  60,  60,  62,  64,  65,  67,  68,  71,  71,  74,  75,  77,  78,  80,  82,  83,  85,  85,  89,  89,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  33,  34,  34,  36,  36,  38,  39,  41,  44,  44,  47,  48,  50,  53,  53,  57,  58,  61,  65,  65,  70,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,  37,  38,  39,  41,  41,  44,  45,  47,  50,  50,  54,  55,  57,  61,  61,  65,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  37,  37,  39,  41,  41,  44,  45,  46,  49,  49,  53,  54,  56,  60,  60,  64,
+             31,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  35,  35,  35,  36,  36,  38,  39,  40,  42,  42,  44,  45,  47,  50,  50,  53,  54,  56,  59,  59,  63,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,  40,  40,  41,  43,  43,  45,  46,  47,  50,  50,  53,  54,  56,  58,  58,  62,
+             32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  38,  38,  40,  40,  41,  43,  43,  45,  46,  47,  50,  50,  53,  54,  56,  58,  58,  62,
+             35,  35,  35,  34,  34,  34,  35,  36,  36,  37,  37,  40,  41,  43,  46,  46,  47,  48,  49,  51,  51,  53,  54,  55,  57,  57,  60,  61,  63,  66,  66,  70,
+             36,  35,  35,  35,  34,  34,  36,  36,  37,  38,  38,  41,  42,  44,  48,  48,  50,  50,  51,  53,  53,  56,  56,  58,  60,  60,  63,  63,  65,  68,  68,  72,
+             39,  38,  38,  37,  37,  37,  38,  38,  39,  40,  40,  43,  44,  46,  50,  50,  52,  53,  54,  57,  57,  59,  60,  61,  64,  64,  67,  68,  69,  72,  72,  76,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  46,  48,  50,  54,  54,  57,  58,  60,  63,  63,  66,  67,  68,  71,  71,  74,  75,  77,  79,  79,  83,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  46,  48,  50,  54,  54,  57,  58,  60,  63,  63,  66,  67,  68,  71,  71,  74,  75,  77,  79,  79,  83,
+             51,  49,  49,  48,  47,  47,  48,  48,  48,  48,  48,  52,  53,  55,  58,  58,  62,  63,  66,  69,  69,  73,  74,  76,  79,  79,  83,  84,  86,  89,  89,  93,
+             53,  52,  51,  50,  49,  49,  49,  50,  49,  49,  49,  53,  54,  56,  60,  60,  64,  65,  67,  71,  71,  75,  76,  78,  82,  82,  86,  87,  89,  92,  92,  96,
+             58,  56,  55,  54,  53,  53,  53,  53,  53,  52,  52,  56,  57,  59,  63,  63,  67,  68,  70,  74,  74,  78,  79,  82,  86,  86,  90,  91,  93,  97,  97, 101,
+             65,  63,  62,  61,  59,  59,  59,  59,  58,  58,  58,  62,  63,  65,  68,  68,  72,  73,  76,  79,  79,  84,  85,  88,  92,  92,  97,  98, 100, 105, 105, 109,
+             65,  63,  62,  61,  59,  59,  59,  59,  58,  58,  58,  62,  63,  65,  68,  68,  72,  73,  76,  79,  79,  84,  85,  88,  92,  92,  97,  98, 100, 105, 105, 109,
+        }, {
+             32,  31,  31,  31,  30,  30,  33,  33,  35,  37,  37,  41,  42,  44,  49,  49,  48,  48,  48,  49,  49,  50,  50,  51,  52,  52,  54,  54,  55,  57,  57,  59,
+             31,  31,  31,  31,  32,  32,  34,  35,  37,  39,  39,  42,  42,  44,  47,  47,  46,  46,  46,  46,  46,  47,  47,  48,  48,  48,  50,  51,  51,  53,  53,  55,
+             31,  31,  31,  32,  32,  32,  35,  36,  37,  40,  40,  42,  43,  44,  46,  46,  46,  46,  45,  45,  45,  46,  46,  47,  48,  48,  49,  50,  51,  52,  52,  54,
+             33,  34,  34,  34,  35,  35,  37,  38,  40,  43,  43,  44,  44,  45,  47,  47,  46,  46,  46,  45,  45,  46,  46,  47,  47,  47,  49,  49,  50,  51,  51,  53,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  49,  49,  50,  50,  52,
+             37,  38,  38,  39,  40,  40,  42,  43,  44,  47,  47,  47,  47,  47,  48,  48,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  49,  49,  50,  50,  52,
+             45,  45,  45,  45,  44,  44,  46,  46,  46,  47,  47,  49,  49,  50,  52,  52,  51,  51,  51,  51,  51,  52,  52,  52,  53,  53,  54,  54,  54,  55,  55,  57,
+             48,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  49,  50,  51,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  54,  55,  55,  56,  56,  56,  58,
+             48,  47,  47,  46,  45,  45,  46,  46,  46,  47,  47,  49,  50,  51,  53,  53,  54,  54,  54,  55,  55,  56,  56,  56,  57,  57,  58,  58,  58,  59,  59,  61,
+             49,  47,  47,  46,  45,  45,  45,  46,  45,  45,  45,  48,  49,  51,  53,  53,  55,  56,  57,  58,  58,  59,  59,  60,  61,  61,  62,  62,  63,  64,  64,  65,
+             49,  47,  47,  46,  45,  45,  45,  46,  45,  45,  45,  48,  49,  51,  53,  53,  55,  56,  57,  58,  58,  59,  59,  60,  61,  61,  62,  62,  63,  64,  64,  65,
+             51,  50,  49,  48,  47,  47,  47,  47,  47,  46,  46,  49,  50,  52,  54,  54,  56,  57,  58,  61,  61,  62,  63,  64,  65,  65,  67,  67,  68,  69,  69,  70,
+             52,  50,  50,  49,  48,  48,  47,  47,  47,  47,  47,  50,  50,  52,  54,  54,  57,  57,  59,  61,  61,  63,  64,  65,  66,  66,  68,  68,  69,  70,  70,  72,
+             54,  52,  51,  51,  49,  49,  49,  49,  48,  48,  48,  51,  51,  53,  55,  55,  58,  58,  60,  62,  62,  64,  65,  66,  68,  68,  70,  70,  71,  73,  73,  74,
+             57,  55,  54,  53,  52,  52,  51,  51,  51,  50,  50,  52,  53,  54,  57,  57,  59,  60,  61,  64,  64,  66,  67,  68,  71,  71,  73,  73,  74,  76,  76,  78,
+             57,  55,  54,  53,  52,  52,  51,  51,  51,  50,  50,  52,  53,  54,  57,  57,  59,  60,  61,  64,  64,  66,  67,  68,  71,  71,  73,  73,  74,  76,  76,  78,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  34,  34,  35,  36,  36,  38,  39,  39,  42,  44,  44,  47,  48,  49,  53,  53,  55,  58,  58,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  35,  37,  38,  38,  40,  42,  42,  45,  46,  47,  50,  51,  52,  55,  55,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  36,  37,  37,  40,  41,  41,  44,  45,  46,  49,  49,  51,  54,  54,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  34,  34,  34,  35,  35,  37,  38,  38,  40,  41,  41,  44,  45,  46,  49,  49,  51,  54,  54,
+             32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  35,  36,  36,  37,  37,  37,  39,  40,  40,  42,  42,  43,  45,  46,  47,  49,  50,  51,  54,  54,
+             32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  35,  35,  36,  37,  37,  37,  38,  38,  40,  40,  40,  42,  43,  43,  45,  46,  47,  49,  50,  51,  54,  54,
+             32,  33,  33,  33,  33,  33,  33,  34,  34,  35,  36,  36,  36,  38,  38,  39,  40,  40,  41,  42,  42,  44,  45,  45,  47,  48,  48,  51,  51,  53,  55,  55,
+             35,  35,  35,  35,  34,  34,  35,  36,  36,  37,  38,  38,  39,  42,  42,  44,  47,  47,  48,  49,  49,  51,  52,  52,  54,  55,  56,  58,  59,  60,  62,  62,
+             36,  35,  35,  35,  35,  34,  35,  36,  36,  37,  38,  38,  40,  42,  42,  45,  48,  48,  49,  50,  50,  52,  53,  54,  56,  56,  57,  59,  60,  61,  63,  63,
+             38,  37,  37,  37,  36,  36,  36,  38,  38,  38,  39,  39,  41,  44,  44,  46,  49,  49,  51,  52,  52,  55,  56,  56,  58,  59,  60,  62,  63,  64,  67,  67,
+             44,  43,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  44,  48,  48,  50,  54,  54,  56,  58,  58,  61,  63,  63,  66,  67,  67,  71,  71,  72,  75,  75,
+             44,  43,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  44,  48,  48,  50,  54,  54,  56,  58,  58,  61,  63,  63,  66,  67,  67,  71,  71,  72,  75,  75,
+             47,  46,  45,  45,  44,  44,  44,  45,  45,  45,  45,  45,  47,  50,  50,  53,  56,  56,  58,  60,  60,  64,  66,  66,  69,  70,  71,  74,  75,  76,  79,  79,
+             53,  52,  51,  51,  49,  49,  49,  49,  50,  49,  49,  49,  51,  54,  54,  57,  60,  60,  63,  65,  65,  69,  71,  72,  75,  76,  77,  81,  82,  83,  87,  87,
+             53,  52,  51,  51,  49,  49,  49,  49,  50,  49,  49,  49,  51,  54,  54,  57,  60,  60,  63,  65,  65,  69,  71,  72,  75,  76,  77,  81,  82,  83,  87,  87,
+             59,  57,  56,  56,  54,  54,  54,  54,  54,  54,  53,  53,  55,  58,  58,  61,  64,  64,  67,  69,  69,  73,  75,  76,  79,  80,  81,  86,  87,  88,  92,  92,
+        }, {
+             32,  31,  31,  31,  30,  30,  31,  33,  33,  34,  37,  37,  39,  42,  42,  45,  49,  49,  48,  48,  48,  49,  49,  49,  50,  50,  51,  52,  52,  53,  54,  54,
+             31,  31,  31,  31,  31,  31,  32,  35,  35,  36,  39,  39,  40,  42,  42,  45,  47,  47,  47,  46,  46,  46,  46,  46,  47,  48,  48,  49,  49,  50,  51,  51,
+             31,  31,  31,  31,  32,  32,  33,  35,  36,  37,  40,  40,  41,  43,  43,  44,  46,  46,  46,  46,  46,  45,  45,  45,  46,  46,  47,  48,  48,  48,  50,  50,
+             31,  32,  32,  32,  32,  33,  33,  36,  36,  37,  41,  41,  42,  43,  43,  45,  47,  47,  46,  46,  46,  45,  45,  45,  46,  46,  47,  48,  48,  48,  50,  50,
+             35,  36,  37,  37,  38,  38,  38,  41,  41,  42,  45,  45,  46,  46,  46,  47,  48,  48,  47,  46,  46,  46,  45,  46,  46,  46,  47,  47,  47,  48,  49,  49,
+             37,  38,  38,  38,  39,  40,  40,  43,  43,  44,  47,  47,  47,  47,  47,  47,  48,  48,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  49,  49,
+             38,  39,  40,  40,  40,  41,  41,  43,  44,  45,  47,  47,  47,  48,  48,  48,  49,  49,  48,  48,  48,  47,  47,  47,  48,  48,  48,  48,  48,  49,  50,  50,
+             47,  46,  46,  46,  45,  45,  45,  46,  46,  47,  47,  47,  48,  50,  50,  51,  52,  52,  52,  52,  52,  52,  52,  52,  53,  53,  53,  53,  53,  54,  55,  55,
+             48,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  48,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,  54,  54,  55,  55,
+             48,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  50,  50,  51,  53,  53,  53,  54,  54,  54,  55,  55,  55,  55,  55,  56,  56,  56,  57,  57,
+             49,  48,  47,  47,  45,  45,  45,  45,  46,  45,  45,  45,  47,  49,  49,  51,  53,  53,  55,  56,  56,  57,  58,  58,  59,  59,  60,  61,  61,  61,  62,  62,
+             49,  48,  47,  47,  45,  45,  45,  45,  46,  45,  45,  45,  47,  49,  49,  51,  53,  53,  55,  56,  56,  57,  58,  58,  59,  59,  60,  61,  61,  61,  62,  62,
+             50,  49,  48,  48,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  52,  54,  54,  55,  56,  56,  58,  59,  60,  61,  61,  61,  63,  63,  63,  65,  65,
+             52,  50,  50,  50,  48,  48,  48,  47,  47,  47,  47,  47,  48,  50,  50,  52,  54,  54,  56,  57,  57,  60,  61,  61,  63,  64,  64,  66,  66,  67,  68,  68,
+             52,  50,  50,  50,  48,  48,  48,  47,  47,  47,  47,  47,  48,  50,  50,  52,  54,  54,  56,  57,  57,  60,  61,  61,  63,  64,  64,  66,  66,  67,  68,  68,
+             54,  53,  52,  52,  50,  50,  50,  49,  49,  49,  48,  48,  50,  52,  52,  54,  55,  55,  57,  59,  59,  61,  62,  63,  65,  65,  66,  68,  68,  69,  71,  71,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  34,  34,  34,  35,  36,  36,  38,  39,  39,  41,  44,  44,  44,  47,  48,  48,  51,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  35,  35,  37,  38,  38,  40,  42,  42,  43,  45,  46,  46,  49,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  34,  36,  37,  37,  39,  41,  41,  42,  44,  45,  45,  47,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  34,  36,  37,  37,  39,  41,  41,  42,  44,  45,  45,  47,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  36,  36,  37,  39,  39,  40,  42,  42,  42,  44,  45,  45,  48,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  38,  38,  38,  40,  40,  40,  41,  43,  43,  43,  45,  46,  46,  48,
+             32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  38,  38,  38,  40,  40,  40,  41,  43,  43,  43,  45,  46,  46,  48,
+             32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  35,  36,  36,  36,  38,  38,  38,  39,  40,  40,  41,  42,  42,  43,  45,  45,  45,  47,  48,  48,  50,
+             35,  35,  35,  35,  34,  34,  34,  34,  35,  36,  36,  37,  37,  37,  39,  41,  41,  42,  45,  46,  46,  47,  48,  48,  49,  51,  51,  51,  53,  54,  54,  56,
+             36,  35,  35,  35,  35,  34,  34,  35,  36,  36,  36,  37,  38,  38,  40,  42,  42,  43,  47,  48,  48,  49,  50,  50,  51,  53,  53,  54,  56,  56,  56,  58,
+             36,  35,  35,  35,  35,  34,  34,  35,  36,  36,  36,  37,  38,  38,  40,  42,  42,  43,  47,  48,  48,  49,  50,  50,  51,  53,  53,  54,  56,  56,  56,  58,
+             40,  39,  39,  39,  39,  38,  38,  38,  39,  39,  39,  40,  41,  41,  42,  45,  45,  46,  50,  51,  51,  53,  54,  54,  56,  59,  59,  59,  61,  62,  62,  64,
+             44,  43,  42,  42,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  44,  48,  48,  49,  52,  54,  54,  56,  58,  58,  60,  63,  63,  64,  66,  67,  67,  69,
+             44,  43,  42,  42,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  44,  48,  48,  49,  52,  54,  54,  56,  58,  58,  60,  63,  63,  64,  66,  67,  67,  69,
+             47,  46,  45,  45,  45,  44,  44,  44,  44,  45,  45,  45,  45,  45,  47,  50,  50,  51,  55,  56,  56,  58,  60,  60,  62,  66,  66,  67,  69,  70,  70,  73,
+             53,  52,  51,  51,  50,  49,  49,  49,  49,  50,  50,  49,  49,  49,  51,  54,  54,  55,  59,  60,  60,  63,  65,  65,  67,  71,  71,  72,  75,  76,  76,  79,
+        }, {
+             32,  31,  31,  31,  31,  30,  30,  31,  33,  33,  33,  35,  37,  37,  39,  42,  42,  43,  47,  49,  49,  48,  48,  48,  48,  49,  49,  49,  50,  50,  50,  51,
+             31,  31,  31,  31,  31,  31,  31,  32,  34,  35,  35,  37,  39,  39,  40,  42,  42,  43,  46,  47,  47,  47,  47,  47,  47,  46,  46,  47,  48,  48,  48,  49,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,  43,  43,  46,  46,  46,  46,  46,  46,  45,  45,  45,  45,  46,  46,  46,  47,
+             31,  31,  31,  31,  32,  32,  32,  33,  35,  36,  36,  38,  40,  40,  41,  43,  43,  43,  46,  46,  46,  46,  46,  46,  45,  45,  45,  45,  46,  46,  46,  47,
+             33,  33,  34,  34,  34,  35,  35,  35,  37,  38,  38,  41,  43,  43,  43,  44,  44,  45,  46,  47,  47,  46,  46,  46,  46,  45,  45,  45,  46,  46,  46,  47,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  46,  46,  46,  46,  46,  46,  46,  47,
+             37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,  45,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  46,  46,  46,  46,  46,  46,  46,  47,
+             38,  39,  40,  40,  40,  41,  41,  41,  43,  44,  44,  46,  47,  47,  47,  48,  48,  48,  48,  49,  49,  48,  48,  48,  47,  47,  47,  47,  48,  48,  48,  48,
+             45,  45,  45,  45,  45,  44,  44,  45,  46,  46,  46,  47,  47,  47,  48,  49,  49,  50,  51,  52,  52,  52,  51,  51,  51,  51,  51,  52,  52,  52,  52,  52,
+             48,  47,  47,  47,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  50,  50,  50,  52,  53,  53,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,
+             48,  47,  47,  47,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  50,  50,  50,  52,  53,  53,  53,  53,  53,  53,  53,  53,  53,  54,  54,  54,  54,
+             49,  48,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  47,  49,  49,  50,  52,  53,  53,  54,  54,  54,  55,  56,  56,  56,  57,  57,  57,  58,
+             49,  48,  47,  47,  46,  45,  45,  45,  45,  46,  46,  45,  45,  45,  47,  49,  49,  50,  53,  53,  53,  55,  56,  56,  57,  58,  58,  58,  59,  59,  59,  60,
+             49,  48,  47,  47,  46,  45,  45,  45,  45,  46,  46,  45,  45,  45,  47,  49,  49,  50,  53,  53,  53,  55,  56,  56,  57,  58,  58,  58,  59,  59,  59,  60,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  50,  53,  54,  54,  55,  56,  56,  57,  59,  59,  60,  61,  61,  61,  62,
+             52,  51,  50,  50,  49,  48,  48,  48,  47,  47,  47,  47,  47,  47,  48,  50,  50,  51,  53,  54,  54,  56,  57,  57,  59,  61,  61,  62,  63,  64,  64,  65,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  36,  36,  36,  37,  39,  39,  39,  41,  44,  44,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  35,  35,  37,  38,  38,  38,  40,  42,  42,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  36,  37,  37,  37,  39,  41,  41,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  36,  37,  37,  37,  39,  41,  41,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  36,  37,  37,  37,  39,  41,  41,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  36,  36,  36,  38,  39,  39,  39,  40,  42,  42,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  37,  37,  38,  38,  38,  39,  40,  40,  40,  42,  43,  43,
+             34,  34,  34,  34,  34,  34,  33,  33,  33,  34,  35,  35,  35,  36,  37,  37,  37,  38,  39,  39,  39,  41,  43,  43,  43,  44,  45,  45,  45,  46,  48,  48,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  49,  50,  50,  50,  52,  53,  53,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  49,  50,  50,  50,  52,  53,  53,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  49,  50,  50,  50,  52,  53,  53,
+             39,  39,  38,  38,  38,  38,  37,  37,  37,  38,  39,  39,  39,  40,  40,  40,  40,  42,  45,  45,  45,  47,  51,  51,  51,  52,  54,  54,  54,  56,  58,  58,
+             44,  43,  42,  42,  42,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  42,  45,  48,  48,  48,  50,  54,  54,  54,  56,  58,  58,  58,  60,  63,  63,
+             44,  43,  42,  42,  42,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  42,  45,  48,  48,  48,  50,  54,  54,  54,  56,  58,  58,  58,  60,  63,  63,
+        }, {
+             32,  31,  31,  31,  31,  31,  30,  30,  30,  32,  33,  33,  33,  35,  37,  37,  37,  39,  42,  42,  42,  45,  49,  49,  49,  48,  48,  48,  48,  48,  49,  49,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  33,  34,  34,  34,  36,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,  47,  47,  47,  47,  47,  47,  47,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  36,  36,  36,  38,  40,  40,  40,  41,  43,  43,  43,  44,  46,  46,  46,  46,  46,  46,  46,  45,  45,  45,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  36,  36,  36,  38,  40,  40,  40,  41,  43,  43,  43,  44,  46,  46,  46,  46,  46,  46,  46,  45,  45,  45,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  34,  36,  36,  36,  38,  40,  40,  40,  41,  43,  43,  43,  44,  46,  46,  46,  46,  46,  46,  46,  45,  45,  45,
+             33,  34,  34,  34,  34,  35,  35,  35,  35,  37,  39,  39,  39,  41,  43,  43,  43,  44,  45,  45,  45,  46,  47,  47,  47,  47,  46,  46,  46,  46,  45,  45,
+             37,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  47,  46,  46,  46,
+             37,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  47,  46,  46,  46,
+             37,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  47,  47,  47,  47,  46,  46,  46,
+             42,  42,  42,  42,  42,  42,  42,  42,  42,  44,  45,  45,  45,  46,  47,  47,  47,  48,  48,  48,  48,  49,  50,  50,  50,  50,  50,  50,  50,  49,  49,  49,
+             48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  49,  50,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53,
+             48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  49,  50,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53,
+             48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  49,  50,  50,  50,  51,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53,
+             48,  48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  46,  46,  48,  50,  50,  50,  51,  53,  53,  53,  54,  54,  54,  54,  55,  56,  56,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  46,  46,  46,  45,  45,  45,  45,  47,  49,  49,  49,  51,  53,  53,  53,  54,  56,  56,  56,  57,  58,  58,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  46,  46,  46,  45,  45,  45,  45,  47,  49,  49,  49,  51,  53,  53,  53,  54,  56,  56,  56,  57,  58,  58,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  36,  36,  36,  37,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,  36,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  36,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  36,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  36,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  36,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,  35,  36,  36,  36,  36,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  36,  36,  37,  37,  37,  37,  37,  38,
+             32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,
+             32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,
+             32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,
+             33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  36,  36,  36,  36,  37,  38,  39,  39,  39,  40,  41,  42,  42,  42,  42,
+             35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  34,  35,  35,  36,  36,  36,  36,  37,  37,  37,  37,  39,  40,  41,  41,  41,  43,  45,  46,  46,  46,  46,
+             36,  35,  35,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  41,  42,  42,  42,  44,  47,  48,  48,  48,  49,
+             36,  35,  35,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  41,  42,  42,  42,  44,  47,  48,  48,  48,  49,
+             36,  35,  35,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  41,  42,  42,  42,  44,  47,  48,  48,  48,  49,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  31,  33,  33,  33,  33,  35,  36,  37,  37,  37,  39,  41,  42,  42,  42,  44,  47,  49,  49,  49,  49,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  34,  34,  34,  34,  36,  37,  38,  38,  38,  39,  41,  42,  42,  42,  44,  46,  48,  48,  48,  48,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  33,  34,  35,  35,  35,  37,  38,  39,  39,  39,  40,  42,  42,  42,  42,  44,  46,  47,  47,  47,  47,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,  37,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  46,  46,  46,  46,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,  37,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  46,  46,  46,  46,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  33,  35,  36,  36,  36,  37,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  46,  46,  46,  46,
+             33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  36,  37,  38,  38,  38,  40,  42,  43,  43,  43,  43,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,
+             35,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,  40,  41,  41,  41,  43,  44,  45,  45,  45,  46,  46,  46,  46,  46,  47,  47,  48,  48,  48,  47,
+             37,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  48,  47,
+             37,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  48,  47,
+             37,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  48,  48,  48,  47,
+             40,  41,  41,  41,  41,  41,  41,  42,  42,  42,  42,  43,  44,  44,  44,  44,  45,  47,  47,  47,  47,  48,  48,  48,  48,  48,  49,  49,  50,  50,  50,  49,
+             45,  45,  45,  45,  45,  45,  45,  44,  44,  44,  44,  45,  46,  46,  46,  46,  46,  47,  47,  47,  47,  48,  49,  49,  49,  49,  50,  51,  52,  52,  52,  52,
+             48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,  53,  53,  53,  53,
+             48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,  53,  53,  53,  53,
+             48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,  53,  53,  53,  53,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  36,  36,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,
+             32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  36,  36,  36,  36,  36,  36,  37,  38,  38,
+             34,  34,  34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  39,  39,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,  31,  32,  33,  33,  33,  33,  33,  34,  35,  36,  37,  37,  37,  37,  39,  40,  42,  42,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  34,  34,  34,  35,  36,  37,  38,  38,  38,  38,  39,  41,  42,  42,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  35,  35,  35,  35,  36,  37,  38,  39,  39,  39,  39,  40,  41,  42,  42,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  35,  35,  35,  35,  35,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  33,  34,  35,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,  42,  43,  43,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  34,  35,  36,  36,  36,  36,  37,  39,  40,  41,  41,  41,  41,  42,  42,  43,  43,
+             33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  36,  37,  38,  38,  38,  38,  39,  41,  42,  43,  43,  43,  43,  43,  44,  44,  44,
+             35,  35,  35,  36,  36,  36,  36,  36,  36,  37,  37,  37,  37,  37,  38,  39,  40,  40,  40,  40,  40,  42,  43,  44,  45,  45,  45,  45,  45,  45,  46,  46,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  45,  47,  47,  47,  47,  47,  47,  47,  47,  47,
+             38,  39,  39,  40,  40,  40,  40,  40,  40,  40,  41,  41,  41,  41,  41,  42,  43,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,  47,  47,  47,  48,  48,
+             42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  44,  44,  45,  45,  45,  45,  45,  46,  47,  47,  47,  47,  47,  48,  48,  48,  48,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,  30,  30,  31,  31,  32,  33,  33,  33,  33,  33,  33,  33,  34,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  33,  34,  34,  34,  34,  34,  34,  34,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  34,  34,  34,  34,  34,  34,  34,  35,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  33,  33,  34,  35,  35,  35,  35,  35,  35,  35,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
+             32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  35,  35,  36,  37,  37,  37,  37,  37,  37,  38,
+             33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  38,  38,  38,  38,  38,  39,
+             34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  36,  36,  36,  36,  36,  36,  36,  36,  37,  37,  38,  39,  40,  40,  40,  40,  40,  40,  40,
+             35,  35,  36,  36,  36,  37,  37,  37,  37,  37,  37,  37,  37,  37,  38,  38,  38,  38,  38,  38,  38,  38,  39,  40,  40,  41,  41,  41,  41,  41,  41,  42,
+             37,  37,  37,  38,  38,  38,  38,  38,  38,  38,  38,  38,  39,  39,  39,  40,  40,  40,  40,  40,  40,  40,  41,  41,  42,  43,  43,  43,  43,  43,  43,  44,
+        },
+    }, {
+        {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        }, {
+             32,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  30,  30,  30,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        },
+    },
+};
+
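+/* The qm_tbl_32x32_t tables below differ from the full-rectangle tables
+ * above: each plane holds 528 = 32 * 33 / 2 entries, i.e. only the lower
+ * triangle of a symmetric 32x32 matrix, written row by row with rows of
+ * length 1, 2, ..., 32 (the second array dimension of 2 selects the two
+ * plane types, luma and chroma). The upper triangle is recovered by
+ * mirroring across the diagonal when the table is expanded. A minimal
+ * sketch of such an expansion, assuming lower-triangular row-major input
+ * (expand_triangular_32x32 is an illustrative helper, not a dav1d API):
+ *
+ *   static void expand_triangular_32x32(uint8_t dst[32][32],
+ *                                       const uint8_t *tri)
+ *   {
+ *       for (int y = 0, i = 0; y < 32; y++)
+ *           for (int x = 0; x <= y; x++, i++)
+ *               dst[y][x] = dst[x][y] = tri[i];  // mirror into both halves
+ *   }
+ *
+ * Storing only the triangle roughly halves the footprint of these tables,
+ * which is why the 32x32 variants carry the _t (triangular) suffix.
+ */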
+static const uint8_t qm_tbl_32x32_t[][2][528] = {
+    {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  33,  33,
+             32,  32,  32,  33,  34,  35,
+             34,  34,  33,  34,  35,  37,  39,
+             35,  34,  34,  35,  36,  37,  41,  43,
+             36,  35,  34,  35,  36,  38,  42,  45,  48,
+             39,  38,  37,  38,  39,  40,  45,  47,  50,  54,
+             44,  42,  41,  41,  42,  42,  47,  50,  54,  58,  63,
+             46,  44,  42,  43,  44,  44,  49,  52,  55,  59,  65,  67,
+             48,  46,  44,  45,  45,  46,  51,  53,  57,  61,  67,  69,  71,
+             54,  51,  49,  49,  50,  49,  54,  57,  60,  65,  71,  74,  76,  82,
+             59,  56,  54,  54,  54,  53,  58,  61,  64,  69,  75,  78,  80,  87,  92,
+             62,  59,  56,  56,  56,  55,  60,  63,  66,  71,  77,  80,  83,  89,  95,  98,
+             65,  62,  59,  59,  59,  58,  63,  65,  68,  73,  79,  82,  85,  92,  98, 101, 105,
+             71,  68,  65,  64,  64,  63,  68,  70,  73,  78,  84,  87,  90,  97, 103, 107, 111, 117,
+             80,  76,  72,  72,  71,  69,  74,  76,  79,  84,  90,  93,  96, 104, 110, 114, 118, 125, 134,
+             81,  77,  73,  73,  72,  70,  75,  77,  80,  85,  91,  94,  97, 105, 111, 115, 119, 126, 135, 137,
+             83,  78,  75,  74,  74,  72,  76,  79,  81,  86,  92,  95,  99, 106, 113, 117, 121, 128, 137, 138, 140,
+             88,  84,  80,  79,  78,  76,  80,  82,  85,  91,  95,  98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152,
+             91,  86,  83,  82,  81,  79,  81,  84,  88,  92,  95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159,
+             94,  89,  86,  85,  84,  82,  82,  86,  90,  92,  97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166,
+             97,  92,  90,  88,  86,  85,  84,  89,  91,  95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174,
+            101,  95,  93,  91,  89,  89,  87,  91,  93,  98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183,
+            104,  99,  97,  94,  93,  93,  90,  92,  96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+            107, 102, 101,  97,  96,  96,  93,  93,  99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+            111, 105, 104, 101, 100,  99,  97,  96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210,
+            115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220,
+            119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231,
+            123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242,
+        }, {
+             32,
+             31,  31,
+             30,  31,  32,
+             32,  33,  33,  35,
+             33,  34,  35,  37,  39,
+             36,  38,  40,  41,  43,  47,
+             41,  42,  42,  43,  45,  47,  48,
+             45,  45,  44,  45,  46,  47,  49,  50,
+             49,  47,  46,  47,  47,  48,  50,  51,  53,
+             48,  47,  45,  46,  46,  46,  49,  51,  53,  54,
+             49,  47,  45,  45,  45,  45,  49,  51,  53,  55,  58,
+             50,  47,  45,  46,  46,  46,  49,  51,  54,  56,  59,  60,
+             50,  48,  46,  46,  46,  46,  50,  52,  54,  56,  60,  60,  61,
+             52,  50,  47,  47,  47,  47,  50,  52,  54,  57,  61,  62,  63,  66,
+             54,  52,  49,  49,  49,  48,  52,  53,  55,  58,  62,  64,  65,  68,  71,
+             56,  53,  51,  50,  50,  49,  52,  54,  56,  59,  63,  64,  66,  69,  72,  73,
+             57,  54,  52,  51,  51,  50,  53,  55,  56,  60,  63,  65,  67,  70,  73,  75,  76,
+             60,  57,  54,  54,  53,  52,  55,  57,  58,  61,  65,  67,  68,  72,  75,  77,  79,  82,
+             63,  60,  57,  57,  56,  54,  57,  59,  60,  63,  67,  69,  71,  75,  78,  80,  82,  85,  89,
+             64,  61,  58,  57,  57,  55,  58,  59,  61,  64,  67,  69,  71,  75,  78,  80,  82,  85,  89,  90,
+             65,  61,  58,  58,  57,  55,  58,  60,  61,  64,  68,  70,  71,  75,  79,  81,  83,  86,  90,  91,  91,
+             67,  63,  61,  60,  59,  57,  60,  61,  63,  66,  69,  70,  73,  77,  79,  81,  85,  88,  90,  92,  94,  96,
+             68,  64,  62,  61,  60,  58,  59,  61,  64,  66,  67,  71,  74,  75,  78,  82,  84,  86,  90,  93,  94,  96,  98,
+             69,  65,  63,  62,  61,  59,  59,  62,  64,  65,  68,  71,  72,  75,  79,  80,  83,  87,  89,  92,  96,  97,  98, 100,
+             70,  66,  64,  63,  62,  61,  60,  63,  64,  66,  69,  70,  73,  76,  77,  81,  84,  85,  89,  92,  93,  98,  99, 100, 102,
+             71,  67,  66,  64,  63,  62,  61,  63,  64,  67,  68,  70,  74,  75,  78,  81,  83,  86,  88,  91,  94,  95, 100, 101, 102, 104,
+             72,  68,  67,  65,  64,  64,  61,  63,  65,  67,  68,  71,  73,  75,  78,  79,  84,  85,  88,  91,  93,  97,  98, 102, 103, 104, 106,
+             73,  69,  68,  66,  65,  65,  63,  63,  66,  67,  69,  71,  73,  76,  77,  81,  82,  85,  88,  90,  94,  95,  99, 101, 104, 105, 106, 109,
+             74,  70,  70,  67,  66,  66,  64,  63,  66,  67,  70,  71,  74,  75,  78,  80,  82,  86,  87,  91,  92,  96,  98, 101, 104, 106, 108, 108, 111,
+             75,  71,  71,  68,  68,  67,  66,  64,  66,  68,  70,  71,  74,  75,  79,  79,  84,  84,  88,  90,  93,  95,  98, 101, 103, 107, 108, 110, 111, 113,
+             76,  72,  72,  69,  69,  68,  67,  65,  66,  69,  70,  72,  74,  76,  78,  81,  83,  85,  88,  90,  93,  95,  98, 100, 104, 105, 109, 111, 112, 113, 116,
+             78,  74,  74,  70,  70,  69,  69,  66,  66,  70,  70,  74,  74,  77,  78,  82,  82,  86,  87,  92,  92,  96,  97, 102, 102, 107, 107, 112, 113, 115, 115, 118,
+        },
+    }, {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  33,
+             32,  32,  32,  33,  34,  35,
+             32,  33,  33,  33,  34,  36,  36,
+             34,  34,  33,  34,  35,  37,  38,  39,
+             36,  35,  34,  35,  36,  38,  40,  42,  48,
+             38,  37,  36,  36,  38,  39,  41,  44,  50,  51,
+             39,  38,  37,  38,  39,  40,  42,  45,  50,  52,  54,
+             44,  42,  41,  41,  42,  42,  44,  47,  54,  56,  58,  63,
+             47,  45,  44,  44,  45,  45,  47,  50,  56,  58,  60,  66,  69,
+             49,  47,  46,  45,  46,  46,  48,  51,  57,  60,  62,  68,  71,  73,
+             54,  51,  50,  49,  50,  49,  51,  54,  60,  63,  65,  71,  75,  77,  82,
+             59,  56,  54,  54,  54,  53,  55,  58,  64,  67,  69,  75,  79,  81,  87,  92,
+             61,  58,  56,  56,  56,  55,  57,  60,  65,  68,  70,  77,  81,  83,  89,  94,  97,
+             65,  62,  60,  59,  59,  58,  60,  63,  68,  71,  73,  79,  84,  87,  92,  98, 101, 105,
+             71,  68,  65,  65,  64,  63,  65,  68,  73,  76,  78,  84,  89,  92,  97, 103, 106, 111, 117,
+             76,  72,  70,  69,  68,  66,  68,  71,  76,  79,  81,  88,  92,  95, 101, 107, 110, 115, 122, 127,
+             80,  76,  73,  72,  71,  69,  71,  74,  79,  82,  84,  90,  95,  98, 104, 110, 113, 118, 125, 130, 134,
+             83,  78,  76,  75,  74,  72,  73,  76,  81,  84,  86,  92,  97, 100, 106, 113, 116, 121, 128, 133, 137, 140,
+             86,  82,  79,  78,  77,  74,  76,  79,  84,  87,  89,  95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147,
+             89,  85,  82,  81,  79,  78,  78,  82,  86,  87,  92,  97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155,
+             92,  88,  85,  84,  82,  81,  80,  85,  86,  90,  95,  97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162,
+             95,  90,  88,  86,  85,  84,  82,  86,  88,  93,  95,  99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169,
+             98,  93,  91,  89,  88,  87,  85,  87,  90,  94,  96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+            101,  96,  95,  92,  91,  90,  88,  88,  93,  95,  99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+            104,  99,  98,  95,  94,  93,  91,  90,  95,  96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193,
+            108, 102, 101,  98,  97,  96,  95,  93,  97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201,
+            111, 105, 105, 101, 100,  99,  98,  96,  98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210,
+            114, 109, 109, 104, 104, 102, 102,  99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219,
+        }, {
+             32,
+             31,  31,
+             30,  31,  31,
+             31,  32,  32,  33,
+             33,  34,  35,  36,  39,
+             36,  38,  39,  40,  43,  47,
+             38,  40,  41,  41,  44,  47,  47,
+             41,  42,  42,  43,  45,  47,  48,  48,
+             49,  47,  46,  46,  47,  48,  49,  50,  53,
+             49,  47,  46,  46,  46,  47,  48,  50,  53,  53,
+             48,  47,  46,  45,  46,  46,  48,  49,  53,  54,  54,
+             49,  47,  45,  45,  45,  45,  47,  49,  53,  55,  55,  58,
+             50,  48,  46,  46,  46,  46,  47,  50,  54,  55,  56,  59,  61,
+             51,  48,  47,  46,  47,  46,  47,  50,  54,  55,  56,  60,  61,  62,
+             52,  50,  48,  47,  47,  47,  48,  50,  54,  56,  57,  61,  63,  64,  66,
+             54,  52,  50,  49,  49,  48,  49,  52,  55,  57,  58,  62,  64,  66,  68,  71,
+             55,  53,  51,  50,  50,  49,  50,  52,  56,  58,  59,  63,  65,  66,  69,  72,  73,
+             57,  54,  52,  51,  51,  50,  51,  53,  56,  58,  60,  63,  66,  67,  70,  73,  74,  76,
+             60,  57,  55,  54,  53,  52,  53,  55,  58,  60,  61,  65,  68,  69,  72,  75,  77,  79,  82,
+             62,  59,  57,  56,  55,  53,  54,  56,  59,  61,  63,  66,  69,  70,  74,  77,  78,  80,  84,  86,
+             63,  60,  58,  57,  56,  54,  55,  57,  60,  62,  63,  67,  70,  71,  75,  78,  79,  82,  85,  87,  89,
+             65,  61,  59,  58,  57,  55,  56,  58,  61,  63,  64,  68,  71,  72,  75,  79,  80,  83,  86,  88,  90,  91,
+             66,  63,  60,  59,  58,  56,  58,  59,  62,  64,  65,  69,  72,  73,  76,  80,  81,  84,  87,  90,  91,  93,  94,
+             67,  64,  62,  61,  59,  58,  58,  60,  63,  64,  66,  69,  71,  73,  77,  78,  81,  85,  86,  89,  93,  94,  95,  97,
+             68,  65,  63,  62,  60,  59,  58,  61,  62,  64,  67,  68,  71,  74,  75,  79,  81,  83,  87,  89,  91,  95,  96,  97,  99,
+             69,  66,  64,  63,  61,  61,  59,  61,  62,  65,  66,  68,  72,  73,  76,  78,  80,  84,  85,  88,  91,  92,  97,  98,  98, 101,
+             70,  67,  65,  63,  62,  62,  60,  61,  63,  65,  66,  69,  71,  73,  76,  77,  81,  83,  85,  88,  90,  94,  95,  99, 100, 100, 103,
+             71,  67,  67,  64,  63,  63,  61,  61,  64,  65,  67,  69,  71,  74,  75,  78,  80,  83,  85,  87,  91,  92,  95,  97, 100, 102, 102, 105,
+             72,  68,  68,  65,  65,  64,  62,  62,  64,  65,  68,  69,  72,  73,  76,  78,  80,  83,  84,  88,  89,  93,  95,  97, 100, 102, 104, 104, 107,
+             73,  69,  69,  66,  66,  65,  64,  63,  64,  66,  68,  69,  72,  73,  77,  77,  81,  82,  86,  87,  90,  92,  95,  97,  99, 103, 104, 106, 106, 109,
+             74,  70,  70,  67,  67,  66,  65,  63,  64,  67,  68,  70,  72,  74,  76,  78,  80,  82,  85,  87,  90,  91,  95,  96, 100, 101, 105, 106, 108, 108, 111,
+             75,  71,  71,  68,  68,  66,  66,  64,  64,  68,  68,  71,  71,  75,  75,  79,  79,  83,  84,  88,  89,  93,  93,  98,  98, 102, 103, 108, 108, 110, 110, 113,
+        },
+    }, {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  33,
+             32,  32,  32,  32,  33,  34,
+             32,  32,  32,  32,  34,  34,  35,
+             34,  34,  33,  33,  35,  36,  37,  39,
+             34,  34,  34,  34,  36,  36,  37,  41,  42,
+             36,  35,  34,  34,  36,  37,  38,  42,  45,  48,
+             39,  38,  38,  37,  39,  40,  40,  45,  47,  50,  54,
+             41,  39,  39,  38,  40,  40,  41,  46,  48,  51,  55,  56,
+             44,  42,  41,  41,  42,  42,  42,  47,  50,  54,  58,  59,  63,
+             48,  46,  45,  44,  45,  45,  45,  50,  53,  56,  61,  62,  66,  70,
+             49,  47,  46,  45,  46,  46,  46,  51,  53,  57,  62,  63,  68,  71,  73,
+             54,  51,  50,  49,  50,  49,  49,  54,  56,  60,  65,  67,  71,  76,  77,  82,
+             58,  55,  54,  53,  53,  53,  52,  57,  59,  63,  68,  70,  74,  79,  81,  86,  90,
+             59,  57,  55,  54,  54,  54,  54,  59,  61,  64,  69,  71,  75,  80,  82,  87,  91,  93,
+             65,  62,  60,  59,  59,  58,  58,  63,  65,  68,  73,  75,  79,  85,  87,  92,  97,  99, 105,
+             69,  66,  64,  63,  63,  62,  61,  66,  68,  71,  76,  78,  83,  88,  90,  96, 100, 102, 109, 113,
+             71,  68,  66,  65,  64,  63,  63,  68,  70,  73,  78,  80,  84,  90,  92,  97, 102, 104, 111, 115, 117,
+             80,  76,  73,  72,  71,  70,  69,  74,  76,  79,  84,  86,  90,  96,  98, 104, 109, 111, 118, 123, 125, 134,
+             81,  77,  75,  74,  73,  72,  71,  75,  77,  80,  85,  87,  91,  97,  99, 105, 110, 112, 120, 125, 127, 136, 137,
+             83,  78,  76,  75,  74,  73,  72,  76,  78,  81,  86,  88,  92,  98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140,
+             87,  83,  81,  79,  78,  77,  75,  80,  82,  85,  90,  91,  96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150,
+             90,  85,  83,  81,  80,  79,  78,  81,  83,  87,  89,  93,  98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156,
+             93,  88,  86,  84,  83,  82,  80,  82,  85,  89,  90,  96,  98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+             95,  90,  89,  86,  85,  85,  83,  83,  88,  89,  93,  97,  99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+             98,  93,  92,  89,  88,  87,  86,  85,  89,  90,  96,  97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176,
+            101,  96,  95,  91,  91,  90,  89,  87,  90,  93,  97,  99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 176, 177, 184,
+            104,  99,  98,  94,  94,  92,  92,  90,  92,  96,  98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191,
+            107, 101, 101,  97,  97,  95,  95,  93,  93,  99,  99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+        }, {
+             32,
+             31,  31,
+             30,  31,  31,
+             30,  31,  31,  32,
+             33,  34,  35,  35,  39,
+             35,  36,  37,  37,  41,  43,
+             36,  38,  39,  40,  43,  45,  47,
+             41,  42,  42,  42,  45,  46,  47,  48,
+             44,  44,  44,  44,  46,  46,  47,  49,  50,
+             49,  47,  47,  46,  47,  47,  48,  50,  51,  53,
+             48,  47,  46,  45,  46,  46,  46,  49,  51,  53,  54,
+             48,  47,  46,  45,  46,  46,  46,  49,  51,  53,  54,  55,
+             49,  47,  46,  45,  45,  45,  45,  49,  51,  53,  55,  56,  58,
+             50,  48,  47,  46,  46,  46,  46,  50,  51,  54,  56,  57,  59,  61,
+             51,  48,  47,  46,  47,  46,  46,  50,  51,  54,  56,  57,  60,  62,  62,
+             52,  50,  48,  47,  47,  47,  47,  50,  52,  54,  57,  58,  61,  63,  64,  66,
+             54,  51,  50,  49,  49,  48,  48,  51,  53,  55,  58,  59,  62,  64,  65,  68,  70,
+             55,  52,  51,  50,  49,  49,  48,  52,  53,  55,  59,  60,  62,  65,  66,  68,  70,  71,
+             57,  54,  53,  52,  51,  50,  50,  53,  54,  56,  60,  61,  63,  66,  67,  70,  73,  73,  76,
+             59,  56,  54,  53,  53,  52,  51,  54,  56,  58,  61,  62,  65,  68,  69,  72,  74,  75,  78,  80,
+             60,  57,  55,  54,  53,  53,  52,  55,  56,  58,  61,  63,  65,  68,  69,  72,  75,  76,  79,  81,  82,
+             63,  60,  58,  57,  56,  55,  54,  57,  59,  60,  63,  65,  67,  70,  71,  75,  77,  78,  82,  84,  85,  89,
+             64,  61,  59,  58,  57,  56,  55,  58,  59,  61,  64,  65,  68,  71,  72,  75,  78,  79,  82,  85,  86,  89,  90,
+             65,  61,  60,  58,  57,  56,  55,  58,  59,  61,  64,  65,  68,  71,  72,  75,  78,  79,  83,  85,  86,  90,  91,  91,
+             67,  63,  61,  60,  59,  58,  57,  60,  61,  63,  65,  66,  69,  72,  73,  77,  79,  80,  84,  86,  88,  92,  93,  93,  95,
+             68,  64,  63,  61,  60,  59,  58,  60,  61,  63,  65,  67,  70,  71,  74,  76,  78,  81,  83,  86,  88,  89,  94,  94,  95,  97,
+             68,  65,  64,  62,  61,  60,  58,  59,  61,  64,  64,  68,  69,  71,  74,  75,  79,  80,  83,  86,  87,  91,  92,  95,  96,  97,  99,
+             69,  66,  65,  63,  62,  61,  59,  59,  62,  63,  65,  67,  69,  72,  72,  76,  78,  80,  83,  84,  88,  89,  92,  94,  97,  98,  99, 101,
+             70,  67,  66,  63,  63,  62,  61,  60,  63,  63,  66,  67,  69,  71,  73,  76,  77,  81,  82,  85,  86,  90,  91,  94,  96,  99, 100, 100, 103,
+             71,  67,  67,  64,  64,  63,  62,  61,  62,  64,  66,  67,  70,  71,  74,  74,  78,  79,  83,  84,  87,  89,  91,  94,  95,  99, 100, 102, 102, 104,
+             72,  68,  68,  65,  65,  64,  63,  61,  62,  65,  66,  68,  69,  71,  73,  75,  77,  79,  82,  84,  87,  88,  92,  93,  96,  97, 101, 102, 104, 104, 106,
+             73,  69,  69,  66,  66,  64,  64,  62,  62,  66,  66,  69,  69,  72,  73,  76,  77,  81,  81,  85,  85,  89,  90,  94,  94,  99,  99, 104, 104, 106, 106, 108,
+        },
+    }, {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  33,
+             31,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  33,  34,  35,
+             32,  33,  33,  33,  34,  34,  36,  36,
+             34,  34,  34,  33,  35,  35,  37,  38,  39,
+             35,  35,  34,  34,  36,  36,  38,  39,  42,  46,
+             36,  35,  35,  34,  36,  36,  38,  40,  42,  47,  48,
+             39,  38,  38,  37,  39,  39,  40,  42,  45,  49,  50,  54,
+             41,  40,  39,  38,  40,  40,  41,  43,  46,  50,  52,  55,  57,
+             44,  42,  42,  41,  42,  42,  42,  44,  47,  52,  54,  58,  60,  63,
+             47,  45,  45,  44,  44,  45,  45,  47,  50,  55,  56,  60,  62,  66,  69,
+             48,  46,  45,  44,  45,  45,  46,  47,  51,  55,  57,  61,  63,  67,  70,  71,
+             54,  51,  50,  49,  49,  50,  49,  51,  54,  59,  60,  65,  67,  71,  75,  76,  82,
+             56,  53,  52,  51,  51,  51,  51,  53,  56,  60,  61,  66,  69,  73,  77,  78,  84,  86,
+             59,  56,  55,  54,  54,  54,  53,  55,  58,  62,  64,  69,  71,  75,  79,  80,  87,  89,  92,
+             64,  61,  60,  58,  58,  58,  57,  59,  62,  66,  67,  72,  75,  79,  83,  84,  91,  93,  97, 102,
+             65,  62,  61,  59,  59,  59,  58,  60,  63,  67,  68,  73,  75,  79,  84,  85,  92,  94,  98, 103, 105,
+             71,  68,  67,  65,  64,  64,  63,  65,  68,  72,  73,  78,  80,  84,  89,  90,  97, 100, 103, 109, 111, 117,
+             74,  71,  69,  68,  67,  67,  65,  67,  70,  74,  75,  80,  83,  86,  91,  93, 100, 102, 106, 112, 114, 120, 123,
+             80,  76,  74,  72,  71,  71,  69,  71,  74,  78,  79,  84,  86,  90,  95,  96, 104, 106, 110, 116, 118, 125, 128, 134,
+             82,  78,  76,  74,  73,  73,  71,  73,  76,  79,  80,  86,  88,  92,  97,  98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+             83,  78,  77,  75,  74,  74,  72,  73,  76,  80,  81,  86,  89,  92,  97,  99, 106, 109, 113, 119, 121, 128, 131, 137, 139, 140,
+             87,  83,  81,  79,  78,  78,  75,  77,  80,  83,  85,  90,  92,  96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+             90,  85,  84,  81,  80,  80,  78,  78,  82,  84,  87,  91,  93,  98,  99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+             92,  88,  87,  84,  83,  82,  80,  80,  84,  85,  90,  91,  95,  98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162,
+             95,  90,  89,  86,  85,  84,  83,  82,  85,  87,  91,  92,  97,  98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168,
+             97,  92,  92,  88,  88,  86,  86,  84,  85,  90,  91,  95,  97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174,
+            100,  95,  95,  90,  90,  89,  89,  86,  86,  92,  92,  97,  98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             30,  31,  31,  32,
+             33,  34,  34,  34,  37,
+             33,  34,  35,  35,  38,  39,
+             36,  38,  39,  40,  42,  43,  47,
+             38,  40,  40,  41,  43,  44,  47,  47,
+             41,  42,  42,  42,  44,  45,  47,  48,  48,
+             47,  46,  46,  45,  46,  47,  47,  48,  50,  52,
+             49,  47,  47,  46,  47,  47,  48,  49,  50,  52,  53,
+             48,  47,  46,  45,  46,  46,  46,  48,  49,  52,  53,  54,
+             49,  47,  46,  45,  46,  46,  46,  47,  49,  52,  53,  55,  55,
+             49,  47,  46,  45,  45,  45,  45,  47,  49,  52,  53,  55,  57,  58,
+             50,  48,  47,  46,  46,  46,  46,  47,  50,  53,  54,  56,  57,  59,  61,
+             50,  48,  47,  46,  46,  46,  46,  47,  50,  53,  54,  56,  58,  60,  61,  61,
+             52,  50,  49,  47,  47,  47,  47,  48,  50,  53,  54,  57,  59,  61,  63,  63,  66,
+             53,  50,  50,  48,  48,  48,  47,  49,  51,  54,  55,  58,  59,  62,  64,  64,  67,  68,
+             54,  52,  51,  49,  49,  49,  48,  49,  52,  55,  55,  58,  60,  62,  64,  65,  68,  69,  71,
+             56,  54,  53,  51,  51,  51,  49,  51,  53,  55,  56,  59,  61,  63,  66,  66,  70,  71,  73,  75,
+             57,  54,  53,  52,  51,  51,  50,  51,  53,  56,  56,  60,  61,  63,  66,  67,  70,  71,  73,  76,  76,
+             60,  57,  56,  54,  53,  53,  52,  53,  55,  58,  58,  61,  63,  65,  68,  68,  72,  73,  75,  78,  79,  82,
+             61,  58,  57,  55,  55,  54,  53,  54,  56,  58,  59,  62,  64,  66,  69,  69,  73,  74,  76,  79,  80,  83,  84,
+             63,  60,  59,  57,  56,  56,  54,  55,  57,  60,  60,  63,  65,  67,  70,  71,  75,  76,  78,  81,  82,  85,  86,  89,
+             64,  61,  60,  58,  57,  57,  55,  56,  58,  60,  61,  64,  66,  68,  70,  71,  75,  77,  79,  82,  82,  86,  87,  90,  91,
+             65,  61,  60,  58,  57,  57,  55,  56,  58,  61,  61,  64,  66,  68,  71,  71,  75,  77,  79,  82,  83,  86,  88,  90,  91,  91,
+             67,  63,  62,  60,  59,  59,  57,  58,  60,  62,  63,  66,  67,  69,  72,  73,  77,  78,  80,  83,  84,  88,  89,  92,  93,  93,  95,
+             67,  64,  63,  61,  60,  60,  58,  58,  61,  61,  63,  65,  67,  70,  70,  74,  75,  78,  80,  81,  85,  86,  89,  91,  93,  94,  95,  97,
+             68,  65,  64,  62,  61,  60,  59,  58,  61,  61,  64,  65,  67,  69,  71,  73,  75,  78,  79,  83,  83,  87,  88,  91,  93,  95,  96,  97,  99,
+             69,  65,  65,  62,  62,  61,  60,  59,  61,  62,  64,  65,  68,  68,  72,  72,  76,  76,  80,  81,  84,  86,  88,  90,  92,  95,  96,  98,  98, 100,
+             70,  66,  66,  63,  63,  62,  61,  60,  60,  63,  64,  66,  67,  69,  71,  73,  75,  77,  79,  81,  84,  85,  88,  89,  93,  93,  97,  98, 100, 100, 102,
+             71,  67,  67,  64,  64,  62,  62,  60,  60,  64,  64,  67,  67,  70,  70,  74,  74,  78,  78,  82,  82,  86,  86,  91,  91,  95,  95, 100, 100, 101, 101, 104,
+        },
+    }, {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  33,  33,  34,
+             32,  32,  32,  32,  33,  34,  35,  35,
+             33,  33,  33,  33,  34,  35,  36,  36,  38,
+             34,  34,  34,  33,  34,  35,  36,  37,  39,  39,
+             36,  35,  35,  34,  35,  36,  37,  38,  42,  42,  48,
+             36,  35,  35,  34,  35,  36,  38,  38,  42,  43,  48,  49,
+             39,  38,  38,  37,  38,  39,  40,  40,  44,  45,  50,  51,  54,
+             41,  39,  39,  38,  39,  40,  40,  41,  45,  46,  51,  52,  55,  56,
+             44,  42,  42,  41,  41,  42,  42,  42,  46,  47,  54,  54,  58,  59,  63,
+             46,  44,  44,  42,  43,  44,  44,  44,  48,  49,  55,  55,  59,  61,  65,  67,
+             48,  46,  46,  44,  45,  45,  45,  46,  50,  51,  57,  57,  61,  63,  67,  69,  71,
+             52,  50,  49,  48,  48,  48,  48,  48,  52,  53,  59,  59,  64,  65,  70,  72,  74,  78,
+             54,  51,  51,  49,  49,  50,  49,  49,  53,  54,  60,  60,  65,  67,  71,  74,  76,  80,  82,
+             58,  56,  55,  53,  53,  53,  53,  53,  57,  58,  63,  64,  68,  70,  75,  77,  80,  84,  86,  91,
+             59,  56,  56,  54,  54,  54,  53,  53,  57,  58,  64,  64,  69,  70,  75,  78,  80,  85,  87,  91,  92,
+             65,  62,  61,  59,  59,  59,  58,  58,  62,  63,  68,  68,  73,  75,  79,  82,  85,  90,  92,  97,  98, 105,
+             66,  63,  63,  60,  60,  60,  59,  59,  63,  64,  69,  69,  74,  76,  80,  83,  86,  91,  93,  98,  99, 106, 107,
+             71,  68,  67,  65,  65,  64,  63,  63,  67,  68,  73,  73,  78,  80,  84,  87,  90,  95,  97, 103, 103, 111, 112, 117,
+             74,  71,  70,  68,  67,  67,  66,  65,  69,  70,  75,  75,  80,  82,  86,  89,  93,  97, 100, 105, 106, 114, 115, 120, 123,
+             80,  76,  75,  72,  72,  71,  70,  69,  73,  74,  79,  79,  84,  86,  90,  93,  96, 101, 104, 110, 110, 118, 119, 125, 128, 134,
+             81,  77,  77,  74,  73,  73,  71,  71,  74,  75,  80,  80,  85,  87,  91,  94,  98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+             83,  78,  78,  75,  74,  74,  72,  72,  75,  76,  81,  81,  86,  88,  92,  95,  99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+             86,  82,  81,  78,  77,  77,  75,  74,  78,  79,  84,  84,  89,  91,  95,  98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147,
+             89,  84,  84,  80,  80,  79,  78,  77,  79,  81,  85,  86,  91,  92,  97,  98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153,
+             91,  86,  86,  82,  82,  81,  80,  79,  80,  84,  85,  88,  91,  94,  97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159,
+             93,  88,  88,  84,  84,  83,  83,  80,  81,  86,  86,  91,  91,  96,  97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             30,  31,  31,  32,
+             31,  32,  32,  33,  34,
+             33,  34,  35,  35,  37,  39,
+             35,  37,  37,  38,  39,  41,  44,
+             36,  38,  39,  40,  41,  43,  46,  47,
+             40,  41,  41,  42,  43,  44,  46,  47,  48,
+             41,  42,  42,  42,  43,  45,  46,  47,  48,  48,
+             49,  47,  47,  46,  46,  47,  47,  48,  50,  50,  53,
+             49,  47,  47,  46,  46,  47,  47,  47,  49,  50,  53,  53,
+             48,  47,  47,  45,  46,  46,  46,  46,  49,  49,  53,  53,  54,
+             48,  47,  46,  45,  45,  46,  46,  46,  49,  49,  53,  53,  54,  55,
+             49,  47,  46,  45,  45,  45,  45,  45,  48,  49,  53,  54,  55,  56,  58,
+             50,  47,  47,  45,  46,  46,  46,  46,  49,  49,  54,  54,  56,  57,  59,  60,
+             50,  48,  48,  46,  46,  46,  46,  46,  49,  50,  54,  54,  56,  57,  60,  60,  61,
+             52,  49,  49,  47,  47,  47,  47,  46,  49,  50,  54,  54,  57,  58,  61,  62,  63,  65,
+             52,  50,  49,  47,  47,  47,  47,  47,  49,  50,  54,  54,  57,  58,  61,  62,  63,  65,  66,
+             54,  52,  51,  49,  49,  49,  48,  48,  51,  52,  55,  55,  58,  59,  62,  63,  65,  67,  68,  70,
+             54,  52,  51,  49,  49,  49,  48,  48,  51,  52,  55,  56,  58,  60,  62,  64,  65,  67,  68,  70,  71,
+             57,  54,  54,  52,  51,  51,  50,  50,  52,  53,  56,  57,  60,  61,  63,  65,  67,  69,  70,  73,  73,  76,
+             57,  55,  54,  52,  52,  51,  51,  50,  53,  53,  57,  57,  60,  61,  64,  65,  67,  70,  71,  73,  74,  77,  77,
+             60,  57,  56,  54,  54,  53,  52,  52,  54,  55,  58,  59,  61,  63,  65,  67,  68,  71,  72,  75,  75,  79,  79,  82,
+             61,  58,  57,  55,  55,  54,  53,  53,  55,  56,  59,  59,  62,  63,  66,  68,  69,  72,  73,  76,  76,  80,  80,  83,  84,
+             63,  60,  59,  57,  57,  56,  55,  54,  57,  57,  60,  61,  63,  65,  67,  69,  71,  73,  75,  78,  78,  82,  82,  85,  86,  89,
+             64,  61,  60,  58,  57,  57,  56,  55,  57,  58,  61,  61,  64,  65,  68,  69,  71,  74,  75,  78,  78,  82,  83,  86,  87,  89,  90,
+             65,  61,  61,  58,  58,  57,  56,  55,  58,  58,  61,  62,  64,  65,  68,  70,  71,  74,  75,  78,  79,  83,  83,  86,  88,  90,  91,  91,
+             66,  63,  62,  60,  59,  58,  57,  56,  59,  59,  62,  63,  65,  66,  69,  70,  72,  75,  76,  79,  80,  84,  84,  87,  89,  91,  92,  93,  94,
+             67,  64,  63,  61,  60,  59,  58,  57,  59,  60,  62,  63,  66,  66,  70,  70,  73,  74,  77,  78,  81,  83,  85,  87,  89,  92,  93,  94,  94,  96,
+             68,  64,  64,  61,  61,  60,  59,  58,  59,  61,  62,  64,  65,  67,  69,  71,  72,  74,  77,  78,  81,  82,  85,  86,  89,  90,  94,  94,  96,  96,  98,
+             69,  65,  65,  62,  62,  61,  61,  58,  59,  62,  62,  65,  65,  68,  68,  71,  71,  75,  75,  79,  79,  83,  83,  87,  87,  91,  91,  96,  96,  97,  97,  99,
+        },
+    }, {
+        {
+             32,
+             31,  32,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  32,  34,  34,  35,
+             32,  32,  32,  32,  32,  34,  34,  35,  35,
+             34,  34,  34,  33,  33,  35,  35,  37,  37,  39,
+             34,  34,  34,  33,  33,  35,  35,  37,  37,  39,  39,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,
+             36,  35,  35,  34,  34,  36,  36,  38,  38,  42,  42,  48,  48,
+             39,  38,  38,  37,  37,  39,  39,  40,  40,  45,  45,  50,  50,  54,
+             39,  38,  38,  37,  37,  39,  39,  40,  40,  45,  45,  50,  50,  54,  54,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  47,  47,  54,  54,  58,  58,  63,
+             44,  42,  42,  41,  41,  42,  42,  42,  42,  47,  47,  54,  54,  58,  58,  63,  63,
+             48,  46,  46,  44,  44,  45,  45,  46,  46,  51,  51,  57,  57,  61,  61,  67,  67,  71,
+             48,  46,  46,  44,  44,  45,  45,  46,  46,  51,  51,  57,  57,  61,  61,  67,  67,  71,  71,
+             54,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,
+             54,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,  82,
+             59,  56,  56,  54,  54,  54,  54,  53,  53,  58,  58,  64,  64,  69,  69,  75,  75,  80,  80,  87,  87,  92,
+             59,  56,  56,  54,  54,  54,  54,  53,  53,  58,  58,  64,  64,  69,  69,  75,  75,  80,  80,  87,  87,  92,  92,
+             65,  62,  62,  59,  59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,  98, 105,
+             65,  62,  62,  59,  59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,  98, 105, 105,
+             71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,  73,  73,  78,  78,  84,  84,  90,  90,  97,  97, 103, 103, 111, 111, 117,
+             71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,  73,  73,  78,  78,  84,  84,  90,  90,  97,  97, 103, 103, 111, 111, 117, 117,
+             80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,  84,  90,  90,  96,  96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+             80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,  84,  90,  90,  96,  96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134,
+             83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,  81,  81,  86,  86,  92,  92,  99,  99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140,
+             83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,  81,  81,  86,  86,  92,  92,  99,  99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140,
+             87,  83,  83,  79,  79,  77,  77,  75,  75,  80,  80,  84,  84,  90,  90,  96,  96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             30,  31,  31,  32,
+             30,  31,  31,  32,  32,
+             33,  34,  34,  35,  35,  39,
+             33,  34,  34,  35,  35,  39,  39,
+             36,  38,  38,  40,  40,  43,  43,  47,
+             36,  38,  38,  40,  40,  43,  43,  47,  47,
+             41,  42,  42,  42,  42,  45,  45,  47,  47,  48,
+             41,  42,  42,  42,  42,  45,  45,  47,  47,  48,  48,
+             49,  47,  47,  46,  46,  47,  47,  48,  48,  50,  50,  53,
+             49,  47,  47,  46,  46,  47,  47,  48,  48,  50,  50,  53,  53,
+             48,  47,  47,  45,  45,  46,  46,  46,  46,  49,  49,  53,  53,  54,
+             48,  47,  47,  45,  45,  46,  46,  46,  46,  49,  49,  53,  53,  54,  54,
+             49,  47,  47,  45,  45,  45,  45,  45,  45,  49,  49,  53,  53,  55,  55,  58,
+             49,  47,  47,  45,  45,  45,  45,  45,  45,  49,  49,  53,  53,  55,  55,  58,  58,
+             50,  48,  48,  46,  46,  46,  46,  46,  46,  50,  50,  54,  54,  56,  56,  60,  60,  61,
+             50,  48,  48,  46,  46,  46,  46,  46,  46,  50,  50,  54,  54,  56,  56,  60,  60,  61,  61,
+             52,  50,  50,  47,  47,  47,  47,  47,  47,  50,  50,  54,  54,  57,  57,  61,  61,  63,  63,  66,
+             52,  50,  50,  47,  47,  47,  47,  47,  47,  50,  50,  54,  54,  57,  57,  61,  61,  63,  63,  66,  66,
+             54,  52,  52,  49,  49,  49,  49,  48,  48,  52,  52,  55,  55,  58,  58,  62,  62,  65,  65,  68,  68,  71,
+             54,  52,  52,  49,  49,  49,  49,  48,  48,  52,  52,  55,  55,  58,  58,  62,  62,  65,  65,  68,  68,  71,  71,
+             57,  54,  54,  52,  52,  51,  51,  50,  50,  53,  53,  56,  56,  60,  60,  63,  63,  67,  67,  70,  70,  73,  73,  76,
+             57,  54,  54,  52,  52,  51,  51,  50,  50,  53,  53,  56,  56,  60,  60,  63,  63,  67,  67,  70,  70,  73,  73,  76,  76,
+             60,  57,  57,  54,  54,  53,  53,  52,  52,  55,  55,  58,  58,  61,  61,  65,  65,  68,  68,  72,  72,  75,  75,  79,  79,  82,
+             60,  57,  57,  54,  54,  53,  53,  52,  52,  55,  55,  58,  58,  61,  61,  65,  65,  68,  68,  72,  72,  75,  75,  79,  79,  82,  82,
+             63,  60,  60,  57,  57,  56,  56,  54,  54,  57,  57,  60,  60,  63,  63,  67,  67,  71,  71,  75,  75,  78,  78,  82,  82,  85,  85,  89,
+             63,  60,  60,  57,  57,  56,  56,  54,  54,  57,  57,  60,  60,  63,  63,  67,  67,  71,  71,  75,  75,  78,  78,  82,  82,  85,  85,  89,  89,
+             65,  61,  61,  58,  58,  57,  57,  55,  55,  58,  58,  61,  61,  64,  64,  68,  68,  71,  71,  75,  75,  79,  79,  83,  83,  86,  86,  90,  90,  91,
+             65,  61,  61,  58,  58,  57,  57,  55,  55,  58,  58,  61,  61,  64,  64,  68,  68,  71,  71,  75,  75,  79,  79,  83,  83,  86,  86,  90,  90,  91,  91,
+             67,  63,  63,  60,  60,  59,  59,  57,  57,  60,  60,  62,  62,  66,  66,  69,  69,  72,  72,  76,  76,  80,  80,  84,  84,  88,  88,  92,  92,  93,  93,  95,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  32,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  32,  33,  33,  34,
+             32,  32,  32,  32,  32,  33,  34,  34,  35,
+             32,  32,  32,  32,  33,  33,  34,  34,  35,  35,
+             34,  34,  34,  33,  33,  34,  35,  35,  37,  37,  39,
+             34,  34,  34,  33,  33,  34,  35,  35,  37,  37,  39,  39,
+             35,  35,  35,  34,  34,  35,  36,  36,  38,  38,  42,  42,  46,
+             36,  35,  35,  34,  34,  35,  36,  37,  38,  38,  42,  42,  47,  48,
+             38,  37,  37,  36,  36,  37,  38,  38,  39,  40,  44,  44,  48,  50,  51,
+             39,  38,  38,  38,  37,  38,  39,  39,  40,  41,  45,  45,  49,  50,  52,  54,
+             41,  40,  40,  39,  38,  39,  40,  40,  41,  41,  46,  46,  50,  52,  54,  55,  57,
+             44,  42,  42,  41,  41,  41,  42,  42,  42,  43,  47,  47,  52,  54,  56,  58,  60,  63,
+             45,  43,  43,  42,  41,  42,  42,  43,  43,  43,  48,  48,  53,  54,  57,  58,  60,  64,  65,
+             48,  46,  46,  45,  44,  45,  45,  45,  46,  46,  51,  51,  55,  57,  59,  61,  63,  67,  68,  71,
+             48,  46,  46,  45,  44,  45,  45,  45,  46,  46,  51,  51,  55,  57,  59,  61,  63,  67,  68,  71,  71,
+             53,  51,  51,  49,  49,  49,  49,  49,  49,  49,  54,  54,  58,  59,  62,  64,  67,  71,  72,  75,  75,  81,
+             54,  52,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,  60,  63,  65,  67,  71,  72,  76,  76,  81,  82,
+             57,  55,  55,  53,  52,  52,  52,  52,  52,  52,  57,  57,  61,  62,  65,  67,  70,  74,  75,  79,  79,  85,  85,  89,
+             59,  56,  56,  54,  54,  54,  54,  54,  53,  54,  58,  58,  62,  64,  67,  69,  71,  75,  76,  80,  80,  86,  87,  90,  92,
+             62,  59,  59,  57,  56,  56,  56,  56,  55,  56,  60,  60,  64,  66,  69,  71,  73,  77,  78,  83,  83,  89,  89,  93,  95,  98,
+             65,  62,  62,  60,  59,  59,  59,  59,  58,  58,  63,  63,  67,  68,  71,  73,  75,  79,  81,  85,  85,  91,  92,  96,  98, 101, 105,
+             67,  64,  64,  62,  61,  61,  60,  60,  59,  60,  64,  64,  68,  69,  72,  74,  77,  81,  82,  87,  87,  93,  94,  98,  99, 103, 106, 108,
+             71,  68,  68,  66,  65,  64,  64,  64,  63,  63,  68,  68,  72,  73,  76,  78,  80,  84,  85,  90,  90,  97,  97, 102, 103, 107, 111, 113, 117,
+             72,  69,  69,  66,  65,  65,  65,  64,  63,  64,  68,  68,  72,  73,  76,  78,  81,  85,  86,  91,  91,  97,  98, 102, 104, 108, 111, 113, 118, 119,
+             80,  76,  76,  73,  72,  72,  71,  70,  69,  70,  74,  74,  78,  79,  82,  84,  86,  90,  91,  96,  96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134,
+             80,  76,  76,  73,  72,  72,  71,  70,  69,  70,  74,  74,  78,  79,  82,  84,  86,  90,  91,  96,  96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             30,  31,  31,  31,
+             30,  31,  31,  31,  32,
+             32,  32,  33,  33,  33,  35,
+             33,  34,  34,  35,  35,  37,  39,
+             34,  35,  35,  36,  36,  38,  40,  41,
+             36,  38,  38,  39,  40,  41,  43,  44,  47,
+             37,  38,  39,  40,  40,  42,  43,  44,  47,  47,
+             41,  42,  42,  42,  42,  43,  45,  45,  47,  47,  48,
+             41,  42,  42,  42,  42,  43,  45,  45,  47,  47,  48,  48,
+             47,  46,  46,  46,  45,  46,  47,  47,  47,  48,  50,  50,  52,
+             49,  48,  47,  47,  46,  47,  47,  47,  48,  48,  50,  50,  52,  53,
+             49,  47,  47,  46,  46,  46,  46,  47,  47,  47,  50,  50,  52,  53,  53,
+             48,  47,  47,  46,  45,  46,  46,  46,  46,  47,  49,  49,  52,  53,  54,  54,
+             49,  47,  47,  46,  45,  45,  46,  46,  46,  46,  49,  49,  52,  53,  54,  55,  55,
+             49,  47,  47,  45,  45,  45,  45,  45,  45,  45,  49,  49,  52,  53,  55,  55,  57,  58,
+             49,  47,  47,  46,  45,  45,  45,  45,  45,  46,  49,  49,  52,  53,  55,  56,  57,  59,  59,
+             50,  48,  48,  47,  46,  46,  46,  46,  46,  46,  50,  50,  53,  54,  55,  56,  58,  60,  60,  61,
+             50,  48,  48,  47,  46,  46,  46,  46,  46,  46,  50,  50,  53,  54,  55,  56,  58,  60,  60,  61,  61,
+             52,  50,  49,  48,  47,  47,  47,  47,  46,  47,  50,  50,  53,  54,  56,  57,  59,  61,  61,  63,  63,  66,
+             52,  50,  50,  48,  47,  47,  47,  47,  47,  47,  50,  50,  53,  54,  56,  57,  59,  61,  61,  63,  63,  66,  66,
+             54,  51,  51,  50,  49,  49,  49,  48,  48,  48,  51,  51,  54,  55,  57,  58,  60,  62,  62,  65,  65,  67,  68,  69,
+             54,  52,  52,  50,  49,  49,  49,  49,  48,  48,  52,  52,  55,  55,  57,  58,  60,  62,  63,  65,  65,  68,  68,  70,  71,
+             56,  53,  53,  51,  51,  50,  50,  50,  49,  49,  52,  52,  55,  56,  58,  59,  61,  63,  63,  66,  66,  69,  69,  71,  72,  73,
+             57,  54,  54,  52,  52,  51,  51,  51,  50,  50,  53,  53,  56,  56,  58,  60,  61,  63,  64,  67,  67,  70,  70,  72,  73,  75,  76,
+             58,  55,  55,  53,  52,  52,  52,  51,  50,  51,  54,  54,  56,  57,  59,  60,  62,  64,  65,  67,  67,  71,  71,  73,  74,  75,  77,  78,
+             60,  57,  57,  55,  54,  54,  53,  53,  52,  52,  55,  55,  58,  58,  60,  61,  63,  65,  66,  68,  68,  72,  72,  74,  75,  77,  79,  80,  82,
+             60,  57,  57,  55,  54,  54,  54,  53,  52,  52,  55,  55,  58,  58,  60,  62,  63,  65,  66,  69,  69,  72,  73,  75,  76,  77,  79,  80,  82,  82,
+             63,  60,  60,  58,  57,  57,  56,  55,  54,  55,  57,  57,  60,  60,  62,  63,  65,  67,  68,  71,  71,  74,  75,  77,  78,  80,  82,  83,  85,  85,  89,
+             63,  60,  60,  58,  57,  57,  56,  55,  54,  55,  57,  57,  60,  60,  62,  63,  65,  67,  68,  71,  71,  74,  75,  77,  78,  80,  82,  83,  85,  85,  89,  89,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  32,
+             31,  32,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  32,  32,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  33,  34,  34,  35,
+             32,  32,  32,  32,  32,  32,  33,  34,  34,  35,  35,
+             33,  33,  33,  33,  33,  33,  34,  35,  35,  36,  36,  38,
+             34,  34,  34,  34,  33,  33,  35,  35,  36,  37,  37,  39,  39,
+             34,  34,  34,  34,  34,  34,  35,  36,  36,  37,  37,  40,  41,  42,
+             36,  35,  35,  35,  34,  34,  36,  36,  37,  38,  38,  42,  42,  45,  48,
+             36,  35,  35,  35,  34,  34,  36,  36,  37,  38,  38,  42,  42,  45,  48,  48,
+             38,  38,  38,  37,  37,  37,  38,  38,  39,  40,  40,  43,  44,  46,  50,  50,  52,
+             39,  38,  38,  38,  37,  37,  39,  39,  39,  40,  40,  44,  45,  47,  50,  50,  53,  54,
+             41,  40,  40,  39,  38,  38,  40,  40,  40,  41,  41,  45,  46,  48,  52,  52,  54,  55,  57,
+             44,  42,  42,  42,  41,  41,  42,  42,  42,  42,  42,  46,  47,  50,  54,  54,  57,  58,  60,  63,
+             44,  42,  42,  42,  41,  41,  42,  42,  42,  42,  42,  46,  47,  50,  54,  54,  57,  58,  60,  63,  63,
+             47,  46,  45,  45,  44,  44,  44,  45,  45,  45,  45,  49,  50,  52,  56,  56,  59,  60,  62,  66,  66,  69,
+             48,  47,  46,  45,  44,  44,  45,  45,  45,  46,  46,  50,  51,  53,  57,  57,  60,  61,  63,  67,  67,  70,  71,
+             50,  49,  48,  47,  46,  46,  47,  47,  47,  47,  47,  51,  52,  54,  58,  58,  61,  62,  65,  68,  68,  72,  73,  75,
+             54,  52,  51,  50,  49,  49,  49,  50,  49,  49,  49,  53,  54,  56,  60,  60,  64,  65,  67,  71,  71,  75,  76,  78,  82,
+             54,  52,  51,  50,  49,  49,  49,  50,  49,  49,  49,  53,  54,  56,  60,  60,  64,  65,  67,  71,  71,  75,  76,  78,  82,  82,
+             58,  56,  55,  54,  53,  53,  53,  53,  53,  52,  52,  56,  57,  59,  63,  63,  67,  68,  70,  74,  74,  78,  79,  82,  86,  86,  90,
+             59,  57,  56,  55,  54,  54,  54,  54,  54,  53,  53,  57,  58,  60,  64,  64,  68,  69,  71,  75,  75,  79,  80,  83,  87,  87,  91,  92,
+             61,  59,  58,  57,  56,  56,  56,  56,  55,  55,  55,  59,  60,  62,  65,  65,  69,  70,  73,  77,  77,  81,  82,  85,  89,  89,  93,  94,  97,
+             65,  63,  62,  61,  59,  59,  59,  59,  59,  58,  58,  62,  63,  65,  68,  68,  72,  73,  75,  79,  79,  84,  85,  88,  92,  92,  97,  98, 101, 105,
+             65,  63,  62,  61,  59,  59,  59,  59,  59,  58,  58,  62,  63,  65,  68,  68,  72,  73,  75,  79,  79,  84,  85,  88,  92,  92,  97,  98, 101, 105, 105,
+             70,  67,  67,  65,  64,  64,  63,  63,  63,  62,  62,  66,  67,  69,  72,  72,  76,  77,  79,  83,  83,  88,  89,  92,  96,  96, 101, 102, 105, 109, 109, 114,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             30,  31,  31,  31,  32,
+             30,  31,  31,  31,  32,  32,
+             33,  33,  34,  34,  34,  34,  37,
+             33,  34,  34,  35,  35,  35,  38,  39,
+             34,  36,  36,  36,  37,  37,  40,  40,  42,
+             36,  38,  38,  39,  40,  40,  42,  43,  45,  47,
+             36,  38,  38,  39,  40,  40,  42,  43,  45,  47,  47,
+             40,  41,  41,  41,  42,  42,  44,  44,  45,  47,  47,  48,
+             41,  42,  42,  42,  42,  42,  44,  45,  46,  47,  47,  48,  48,
+             44,  44,  44,  44,  44,  44,  45,  46,  46,  47,  47,  49,  49,  50,
+             49,  48,  47,  47,  46,  46,  47,  47,  47,  48,  48,  50,  50,  51,  53,
+             49,  48,  47,  47,  46,  46,  47,  47,  47,  48,  48,  50,  50,  51,  53,  53,
+             48,  47,  47,  46,  45,  45,  46,  46,  46,  47,  47,  49,  50,  51,  53,  53,  54,
+             48,  47,  47,  46,  45,  45,  46,  46,  46,  46,  46,  49,  49,  51,  53,  53,  54,  54,
+             49,  47,  47,  46,  45,  45,  46,  46,  46,  46,  46,  49,  49,  51,  53,  53,  54,  55,  55,
+             49,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  48,  49,  51,  53,  53,  55,  55,  57,  58,
+             49,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  48,  49,  51,  53,  53,  55,  55,  57,  58,  58,
+             50,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  49,  50,  51,  54,  54,  56,  56,  57,  59,  59,  61,
+             50,  49,  48,  47,  46,  46,  46,  46,  46,  46,  46,  49,  50,  51,  54,  54,  56,  56,  58,  60,  60,  61,  61,
+             51,  49,  49,  48,  47,  47,  47,  47,  47,  46,  46,  49,  50,  51,  54,  54,  56,  57,  58,  60,  60,  62,  62,  63,
+             52,  50,  50,  49,  47,  47,  47,  47,  47,  47,  47,  49,  50,  52,  54,  54,  57,  57,  59,  61,  61,  63,  63,  65,  66,
+             52,  50,  50,  49,  47,  47,  47,  47,  47,  47,  47,  49,  50,  52,  54,  54,  57,  57,  59,  61,  61,  63,  63,  65,  66,  66,
+             54,  52,  51,  50,  49,  49,  49,  49,  48,  48,  48,  51,  51,  53,  55,  55,  58,  58,  60,  62,  62,  64,  65,  66,  68,  68,  70,
+             54,  52,  52,  51,  49,  49,  49,  49,  49,  48,  48,  51,  52,  53,  55,  55,  58,  58,  60,  62,  62,  64,  65,  66,  68,  68,  70,  71,
+             55,  53,  53,  52,  50,  50,  50,  50,  49,  49,  49,  51,  52,  54,  56,  56,  58,  59,  60,  63,  63,  65,  66,  67,  69,  69,  71,  72,  73,
+             57,  55,  54,  53,  52,  52,  51,  51,  50,  50,  50,  52,  53,  54,  56,  56,  59,  60,  61,  63,  63,  66,  67,  68,  70,  70,  73,  73,  74,  76,
+             57,  55,  54,  53,  52,  52,  51,  51,  50,  50,  50,  52,  53,  54,  56,  56,  59,  60,  61,  63,  63,  66,  67,  68,  70,  70,  73,  73,  74,  76,  76,
+             59,  57,  56,  55,  54,  54,  53,  53,  52,  51,  51,  54,  55,  56,  58,  58,  60,  61,  63,  65,  65,  67,  68,  70,  72,  72,  74,  75,  76,  78,  78,  80,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  32,
+             31,  31,  32,  32,
+             31,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  32,  32,  33,  33,
+             32,  32,  32,  32,  32,  32,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,
+             32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  35,  35,
+             32,  33,  33,  33,  33,  33,  33,  34,  34,  35,  36,  36,  36,
+             34,  34,  34,  34,  33,  33,  34,  35,  35,  35,  37,  37,  38,  39,
+             34,  34,  34,  34,  33,  33,  34,  35,  35,  35,  37,  37,  38,  39,  39,
+             35,  34,  34,  34,  34,  34,  34,  35,  36,  36,  37,  37,  39,  41,  41,  43,
+             36,  35,  35,  35,  34,  34,  35,  36,  36,  37,  38,  38,  40,  42,  42,  45,  48,
+             36,  35,  35,  35,  34,  34,  35,  36,  36,  37,  38,  38,  40,  42,  42,  45,  48,  48,
+             38,  37,  37,  37,  36,  36,  36,  38,  38,  38,  39,  39,  41,  44,  44,  47,  50,  50,  51,
+             39,  39,  38,  38,  37,  37,  38,  39,  39,  39,  40,  40,  42,  45,  45,  47,  50,  50,  52,  54,
+             39,  39,  38,  38,  37,  37,  38,  39,  39,  39,  40,  40,  42,  45,  45,  47,  50,  50,  52,  54,  54,
+             42,  41,  41,  41,  40,  40,  40,  41,  41,  41,  42,  42,  44,  47,  47,  49,  53,  53,  55,  56,  56,  60,
+             44,  43,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  44,  47,  47,  50,  54,  54,  56,  58,  58,  61,  63,
+             44,  43,  43,  42,  41,  41,  41,  42,  42,  42,  43,  43,  45,  48,  48,  51,  54,  54,  56,  58,  58,  62,  64,  64,
+             47,  46,  45,  45,  44,  44,  44,  44,  45,  45,  45,  45,  47,  50,  50,  53,  56,  56,  58,  60,  60,  64,  66,  66,  69,
+             48,  47,  46,  46,  45,  44,  45,  45,  45,  45,  46,  46,  47,  51,  51,  53,  57,  57,  59,  61,  61,  65,  67,  67,  70,  71,
+             49,  48,  47,  47,  46,  45,  45,  46,  46,  46,  46,  46,  48,  51,  51,  54,  57,  57,  60,  62,  62,  66,  68,  68,  71,  72,  73,
+             53,  51,  51,  51,  49,  49,  49,  49,  49,  49,  49,  49,  51,  54,  54,  57,  59,  59,  62,  64,  64,  69,  71,  71,  74,  75,  77,  81,
+             54,  52,  51,  51,  50,  49,  49,  50,  50,  49,  49,  49,  51,  54,  54,  57,  60,  60,  63,  65,  65,  69,  71,  72,  75,  76,  77,  81,  82,
+             55,  53,  53,  52,  51,  50,  50,  51,  51,  51,  50,  50,  52,  55,  55,  58,  61,  61,  64,  66,  66,  70,  72,  73,  76,  77,  78,  83,  83,  85,
+             59,  57,  56,  56,  54,  54,  54,  54,  54,  54,  53,  53,  55,  58,  58,  61,  64,  64,  67,  69,  69,  73,  75,  76,  79,  80,  81,  86,  87,  88,  92,
+             59,  57,  56,  56,  54,  54,  54,  54,  54,  54,  53,  53,  55,  58,  58,  61,  64,  64,  67,  69,  69,  73,  75,  76,  79,  80,  81,  86,  87,  88,  92,  92,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             30,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  32,
+             31,  31,  32,  32,  32,  32,  33,
+             33,  34,  34,  34,  35,  35,  35,  38,
+             33,  34,  34,  34,  35,  35,  36,  38,  39,
+             34,  35,  35,  36,  36,  36,  37,  40,  40,  41,
+             36,  38,  38,  38,  39,  40,  40,  43,  43,  44,  47,
+             36,  38,  38,  38,  39,  40,  40,  43,  43,  44,  47,  47,
+             38,  39,  40,  40,  41,  41,  41,  43,  44,  45,  47,  47,  47,
+             41,  42,  42,  42,  42,  42,  43,  44,  45,  45,  47,  47,  48,  48,
+             41,  42,  42,  42,  42,  42,  43,  44,  45,  45,  47,  47,  48,  48,  48,
+             45,  45,  45,  45,  44,  44,  44,  46,  46,  46,  47,  47,  48,  49,  49,  50,
+             49,  48,  47,  47,  46,  46,  46,  47,  47,  47,  48,  48,  49,  50,  50,  51,  53,
+             49,  48,  47,  47,  46,  46,  46,  47,  47,  47,  48,  48,  49,  50,  50,  51,  53,  53,
+             49,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  48,  50,  50,  51,  53,  53,  53,
+             48,  47,  47,  47,  46,  45,  45,  46,  46,  46,  46,  46,  48,  49,  49,  51,  53,  53,  54,  54,
+             48,  47,  47,  47,  46,  45,  45,  46,  46,  46,  46,  46,  48,  49,  49,  51,  53,  53,  54,  54,  54,
+             49,  47,  47,  47,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  51,  53,  53,  54,  55,  55,  57,
+             49,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  51,  53,  53,  55,  55,  55,  57,  58,
+             49,  47,  47,  47,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  51,  53,  53,  55,  56,  56,  58,  58,  59,
+             50,  49,  48,  48,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  52,  54,  54,  55,  56,  56,  58,  59,  59,  61,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  52,  54,  54,  55,  56,  56,  59,  60,  60,  61,  61,
+             51,  49,  48,  48,  47,  46,  46,  47,  47,  46,  46,  46,  47,  50,  50,  52,  54,  54,  55,  56,  56,  59,  60,  60,  61,  62,  62,
+             52,  50,  49,  49,  48,  47,  47,  47,  47,  47,  46,  46,  48,  50,  50,  52,  54,  54,  56,  57,  57,  60,  61,  61,  63,  63,  64,  66,
+             52,  50,  50,  49,  48,  47,  47,  47,  47,  47,  47,  47,  48,  50,  50,  52,  54,  54,  56,  57,  57,  60,  61,  61,  63,  63,  64,  66,  66,
+             53,  51,  50,  50,  48,  48,  48,  48,  48,  48,  47,  47,  48,  51,  51,  52,  54,  54,  56,  58,  58,  60,  61,  62,  63,  64,  64,  67,  67,  68,
+             54,  53,  52,  52,  50,  49,  49,  49,  49,  49,  48,  48,  49,  52,  52,  53,  55,  55,  57,  58,  58,  61,  62,  63,  64,  65,  66,  68,  68,  69,  71,
+             54,  53,  52,  52,  50,  49,  49,  49,  49,  49,  48,  48,  49,  52,  52,  53,  55,  55,  57,  58,  58,  61,  62,  63,  64,  65,  66,  68,  68,  69,  71,  71,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  32,
+             31,  31,  32,  32,
+             31,  31,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  35,
+             32,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  35,  36,  36,  36,
+             34,  34,  34,  34,  34,  33,  33,  34,  35,  35,  35,  36,  37,  37,  38,  39,
+             34,  34,  34,  34,  34,  33,  33,  34,  35,  35,  35,  36,  37,  37,  38,  39,  39,
+             34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  36,  37,  37,  38,  40,  40,  41,
+             35,  35,  35,  35,  34,  34,  34,  34,  36,  36,  36,  37,  38,  38,  39,  42,  42,  43,  46,
+             36,  35,  35,  35,  35,  34,  34,  35,  36,  36,  36,  37,  38,  38,  40,  42,  42,  44,  47,  48,
+             36,  35,  35,  35,  35,  34,  34,  35,  36,  36,  36,  37,  38,  38,  40,  42,  42,  44,  47,  48,  48,
+             38,  37,  37,  37,  36,  36,  36,  36,  37,  38,  38,  39,  39,  39,  41,  44,  44,  45,  48,  50,  50,  51,
+             39,  39,  38,  38,  38,  37,  37,  38,  39,  39,  39,  40,  40,  40,  42,  45,  45,  46,  49,  50,  50,  52,  54,
+             39,  39,  38,  38,  38,  37,  37,  38,  39,  39,  39,  40,  40,  40,  42,  45,  45,  46,  49,  50,  50,  52,  54,  54,
+             41,  40,  40,  40,  39,  38,  38,  39,  40,  40,  40,  41,  41,  41,  43,  46,  46,  47,  50,  52,  52,  54,  55,  55,  57,
+             44,  43,  42,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  42,  44,  47,  47,  49,  52,  54,  54,  56,  58,  58,  60,  63,
+             44,  43,  42,  42,  42,  41,  41,  41,  42,  42,  42,  42,  42,  42,  44,  47,  47,  49,  52,  54,  54,  56,  58,  58,  60,  63,  63,
+             45,  44,  43,  43,  42,  41,  41,  42,  42,  42,  42,  43,  43,  43,  45,  48,  48,  49,  53,  54,  54,  57,  58,  58,  60,  64,  64,  65,
+             47,  46,  45,  45,  45,  44,  44,  44,  44,  45,  45,  45,  45,  45,  47,  50,  50,  51,  55,  56,  56,  58,  60,  60,  62,  66,  66,  67,  69,
+             48,  47,  46,  46,  45,  44,  44,  45,  45,  45,  45,  45,  46,  46,  47,  51,  51,  52,  55,  57,  57,  59,  61,  61,  63,  67,  67,  68,  70,  71,
+             48,  47,  46,  46,  45,  44,  44,  45,  45,  45,  45,  45,  46,  46,  47,  51,  51,  52,  55,  57,  57,  59,  61,  61,  63,  67,  67,  68,  70,  71,  71,
+             51,  50,  49,  49,  48,  47,  47,  47,  48,  48,  48,  48,  48,  48,  50,  53,  53,  54,  57,  58,  58,  61,  63,  63,  66,  69,  69,  70,  73,  74,  74,  77,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  32,
+             30,  31,  31,  31,  31,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  33,
+             33,  33,  34,  34,  34,  34,  34,  35,  37,
+             33,  34,  34,  34,  35,  35,  35,  36,  38,  39,
+             33,  34,  34,  34,  35,  35,  35,  36,  38,  39,  39,
+             35,  36,  37,  37,  37,  38,  38,  38,  41,  41,  41,  44,
+             36,  37,  38,  38,  39,  40,  40,  40,  42,  43,  43,  46,  47,
+             36,  37,  38,  38,  39,  40,  40,  40,  42,  43,  43,  46,  47,  47,
+             38,  39,  40,  40,  40,  41,  41,  41,  43,  44,  44,  46,  47,  47,  47,
+             41,  42,  42,  42,  42,  42,  42,  43,  44,  45,  45,  46,  47,  47,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  43,  44,  45,  45,  46,  47,  47,  48,  48,  48,
+             43,  43,  43,  43,  43,  43,  43,  43,  45,  45,  45,  46,  47,  47,  48,  49,  49,  49,
+             47,  47,  46,  46,  46,  45,  45,  46,  46,  47,  47,  47,  47,  47,  48,  50,  50,  50,  52,
+             49,  48,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  48,  48,  49,  50,  50,  51,  52,  53,
+             49,  48,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  48,  48,  49,  50,  50,  51,  52,  53,  53,
+             49,  48,  47,  47,  46,  46,  46,  46,  46,  46,  46,  47,  47,  47,  48,  50,  50,  50,  52,  53,  53,  53,
+             48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  48,  49,  49,  50,  52,  53,  53,  54,  54,
+             48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  48,  49,  49,  50,  52,  53,  53,  54,  54,  54,
+             49,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  47,  49,  49,  50,  52,  53,  53,  54,  55,  55,  55,
+             49,  47,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  50,  52,  53,  53,  55,  55,  55,  57,  58,
+             49,  47,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  50,  52,  53,  53,  55,  55,  55,  57,  58,  58,
+             49,  48,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  50,  52,  53,  53,  55,  56,  56,  57,  59,  59,  59,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  50,  53,  54,  54,  55,  56,  56,  57,  59,  59,  60,  61,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  50,  53,  54,  54,  55,  56,  56,  58,  60,  60,  60,  61,  61,
+             50,  49,  48,  48,  47,  46,  46,  46,  46,  46,  46,  46,  46,  46,  47,  50,  50,  50,  53,  54,  54,  55,  56,  56,  58,  60,  60,  60,  61,  61,  61,
+             51,  50,  49,  49,  48,  47,  47,  47,  47,  47,  47,  47,  46,  46,  48,  50,  50,  51,  53,  54,  54,  56,  57,  57,  58,  60,  60,  61,  62,  63,  63,  64,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  32,
+             31,  31,  32,  32,
+             31,  31,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  35,  35,
+             33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  35,  36,  36,  36,  37,
+             34,  34,  34,  34,  34,  34,  33,  33,  33,  34,  35,  35,  35,  36,  37,  37,  37,  38,  39,
+             34,  34,  34,  34,  34,  34,  33,  33,  33,  34,  35,  35,  35,  36,  37,  37,  37,  38,  39,  39,
+             34,  34,  34,  34,  34,  34,  33,  33,  33,  34,  35,  35,  35,  36,  37,  37,  37,  38,  39,  39,  39,
+             35,  34,  34,  34,  34,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  37,  37,  39,  41,  41,  41,  43,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,
+             36,  35,  35,  35,  35,  35,  34,  34,  34,  35,  36,  36,  36,  37,  38,  38,  38,  40,  42,  42,  42,  45,  48,  48,  48,
+             37,  37,  37,  37,  37,  36,  36,  36,  36,  37,  38,  38,  38,  38,  39,  39,  39,  41,  44,  44,  44,  46,  49,  49,  49,  51,
+             39,  39,  38,  38,  38,  38,  37,  37,  37,  38,  39,  39,  39,  40,  40,  40,  40,  42,  45,  45,  45,  47,  50,  50,  50,  52,  54,
+             39,  39,  38,  38,  38,  38,  37,  37,  37,  38,  39,  39,  39,  40,  40,  40,  40,  42,  45,  45,  45,  47,  50,  50,  50,  52,  54,  54,
+             39,  39,  38,  38,  38,  38,  37,  37,  37,  38,  39,  39,  39,  40,  40,  40,  40,  42,  45,  45,  45,  47,  50,  50,  50,  52,  54,  54,  54,
+             41,  41,  40,  40,  40,  39,  39,  39,  39,  40,  40,  40,  40,  41,  41,  41,  41,  44,  46,  46,  46,  49,  52,  52,  52,  54,  56,  56,  56,  58,
+             44,  43,  42,  42,  42,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  42,  45,  47,  47,  47,  50,  54,  54,  54,  56,  58,  58,  58,  60,  63,
+             44,  43,  42,  42,  42,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  42,  42,  45,  47,  47,  47,  50,  54,  54,  54,  56,  58,  58,  58,  60,  63,  63,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  31,  32,
+             30,  31,  31,  31,  31,  31,  32,  32,
+             30,  31,  31,  31,  31,  31,  32,  32,  32,
+             32,  32,  33,  33,  33,  33,  33,  33,  33,  35,
+             33,  34,  34,  34,  34,  35,  35,  35,  35,  37,  39,
+             33,  34,  34,  34,  34,  35,  35,  35,  35,  37,  39,  39,
+             33,  34,  34,  34,  34,  35,  35,  35,  35,  37,  39,  39,  39,
+             35,  35,  36,  36,  36,  37,  37,  37,  37,  39,  41,  41,  41,  43,
+             36,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,
+             36,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,
+             36,  37,  38,  38,  38,  39,  40,  40,  40,  41,  43,  43,  43,  45,  47,  47,  47,
+             39,  39,  40,  40,  40,  41,  41,  41,  41,  42,  44,  44,  44,  45,  47,  47,  47,  47,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  43,  45,  45,  45,  46,  47,  47,  47,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  43,  45,  45,  45,  46,  47,  47,  47,  48,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  43,  45,  45,  45,  46,  47,  47,  47,  48,  48,  48,  48,
+             45,  45,  45,  45,  45,  44,  44,  44,  44,  45,  46,  46,  46,  47,  47,  47,  47,  48,  49,  49,  49,  50,
+             49,  48,  47,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  51,  53,
+             49,  48,  47,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  51,  53,  53,
+             49,  48,  47,  47,  47,  47,  46,  46,  46,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  51,  53,  53,  53,
+             49,  48,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  48,  50,  50,  50,  51,  53,  53,  53,  53,
+             48,  48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  46,  46,  48,  49,  49,  49,  51,  53,  53,  53,  53,  54,
+             48,  48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  46,  46,  48,  49,  49,  49,  51,  53,  53,  53,  53,  54,  54,
+             48,  48,  47,  47,  47,  46,  45,  45,  45,  46,  46,  46,  46,  46,  46,  46,  46,  48,  49,  49,  49,  51,  53,  53,  53,  53,  54,  54,  54,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  46,  46,  46,  46,  46,  46,  46,  47,  49,  49,  49,  51,  53,  53,  53,  54,  55,  55,  55,  56,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  49,  51,  53,  53,  53,  54,  55,  55,  55,  57,  58,
+             49,  48,  47,  47,  47,  46,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,  47,  49,  49,  49,  51,  53,  53,  53,  54,  55,  55,  55,  57,  58,  58,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  32,
+             31,  31,  31,  32,  32,
+             31,  31,  31,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  35,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,
+             32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  36,  36,  36,  36,
+             33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  36,  36,  36,  36,  37,  38,
+             34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  38,  39,  39,
+             34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  38,  39,  39,  39,
+             34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  34,  35,  35,  35,  35,  36,  36,  37,  37,  37,  38,  39,  39,  39,  39,
+             34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  37,  37,  37,  38,  40,  41,  41,  41,  42,
+             35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  37,  38,  38,  38,  39,  41,  42,  42,  42,  44,  46,
+             36,  35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  42,  42,  42,  42,  45,  47,  48,
+             36,  35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  42,  42,  42,  42,  45,  47,  48,  48,
+             36,  35,  35,  35,  35,  35,  35,  34,  34,  34,  34,  35,  36,  36,  36,  36,  37,  38,  38,  38,  38,  40,  42,  42,  42,  42,  45,  47,  48,  48,  48,
+             37,  37,  36,  36,  36,  36,  36,  35,  35,  35,  35,  36,  37,  37,  37,  37,  38,  39,  39,  39,  39,  41,  42,  43,  43,  43,  45,  48,  49,  49,  49,  50,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  31,  31,  31,  31,  31,  31,  32,
+             30,  30,  31,  31,  31,  31,  31,  31,  32,  32,
+             30,  30,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  34,
+             33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  36,  37,
+             33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  37,  38,  39,
+             33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  37,  38,  39,  39,
+             33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  37,  38,  39,  39,  39,
+             34,  35,  36,  36,  36,  36,  36,  37,  37,  37,  37,  38,  40,  40,  40,  40,  42,
+             36,  36,  37,  37,  37,  37,  38,  38,  39,  39,  39,  40,  41,  42,  42,  42,  44,  46,
+             36,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  45,  46,  47,
+             36,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  45,  46,  47,  47,
+             36,  37,  38,  38,  38,  38,  39,  39,  40,  40,  40,  41,  42,  43,  43,  43,  45,  46,  47,  47,  47,
+             38,  39,  39,  40,  40,  40,  40,  41,  41,  41,  41,  42,  43,  44,  44,  44,  45,  47,  47,  47,  47,  47,
+             40,  41,  41,  41,  41,  41,  41,  42,  42,  42,  42,  43,  44,  44,  44,  44,  45,  47,  47,  47,  47,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  44,  45,  45,  45,  46,  47,  47,  47,  47,  48,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  44,  45,  45,  45,  46,  47,  47,  47,  47,  48,  48,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  44,  45,  45,  45,  46,  47,  47,  47,  47,  48,  48,  48,  48,  48,
+             44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  45,  46,  46,  46,  46,  47,  47,  47,  47,  48,  49,  49,  49,  49,  50,
+             47,  47,  46,  46,  46,  46,  46,  46,  45,  45,  45,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,
+             49,  48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  50,  51,  52,  53,
+             49,  48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  50,  51,  52,  53,  53,
+             49,  48,  48,  47,  47,  47,  47,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  48,  48,  48,  49,  50,  50,  50,  50,  51,  52,  53,  53,  53,
+             49,  48,  47,  47,  47,  47,  47,  46,  46,  46,  46,  46,  46,  47,  47,  47,  47,  47,  47,  47,  47,  48,  49,  50,  50,  50,  51,  52,  53,  53,  53,  53,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  32,  32,
+             31,  31,  31,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,
+             32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  35,  35,  35,  36,  36,  36,  36,  36,
+             33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  36,  36,  36,  37,  38,
+             34,  34,  34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  39,
+             34,  34,  34,  34,  34,  34,  34,  34,  34,  33,  33,  33,  33,  33,  34,  34,  35,  35,  35,  35,  35,  35,  36,  36,  37,  37,  37,  37,  38,  38,  39,  39,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  32,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,
+             32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  35,
+             33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  36,  37,
+             33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  37,  38,  39,
+             33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  37,  38,  39,  39,
+             33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  37,  38,  39,  39,  39,
+             33,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  36,  37,  38,  39,  39,  39,  39,
+             34,  35,  35,  35,  35,  35,  35,  36,  36,  36,  36,  36,  36,  36,  37,  38,  39,  40,  40,  40,  40,  41,
+             35,  36,  36,  36,  37,  37,  37,  37,  37,  37,  38,  38,  38,  38,  38,  39,  41,  41,  41,  41,  41,  42,  44,
+             36,  37,  37,  38,  38,  38,  38,  38,  38,  39,  39,  39,  39,  39,  40,  41,  42,  43,  43,  43,  43,  44,  45,  46,
+             36,  37,  37,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  46,  47,  47,
+             36,  37,  37,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  46,  47,  47,  47,
+             36,  37,  37,  38,  38,  38,  38,  38,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  44,  46,  47,  47,  47,  47,
+             37,  37,  38,  38,  39,  39,  39,  39,  39,  40,  40,  40,  40,  40,  41,  42,  43,  43,  43,  43,  43,  44,  46,  47,  47,  47,  47,  47,
+             38,  39,  39,  40,  40,  40,  40,  40,  40,  40,  41,  41,  41,  41,  41,  42,  43,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,  47,  47,
+             40,  40,  40,  41,  41,  41,  41,  41,  41,  41,  42,  42,  42,  42,  42,  43,  44,  44,  44,  44,  44,  45,  46,  47,  47,  47,  47,  47,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  43,  44,  45,  45,  45,  45,  45,  46,  47,  47,  47,  47,  47,  48,  48,  48,
+             41,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  42,  43,  43,  44,  45,  45,  45,  45,  45,  46,  47,  47,  47,  47,  47,  48,  48,  48,  48,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  32,
+             31,  31,  31,  31,  31,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,
+             31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,
+             32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,
+             31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,
+             32,  32,  32,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  35,  36,
+             33,  33,  33,  33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  36,  37,  37,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,  39,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,  39,  39,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,  39,  39,  39,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,  39,  39,  39,  39,
+             33,  33,  34,  34,  34,  34,  34,  34,  34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  37,  37,  38,  39,  39,  39,  39,  39,  39,
+             34,  34,  34,  35,  35,  35,  35,  35,  35,  35,  35,  35,  35,  36,  36,  36,  36,  36,  36,  36,  36,  37,  37,  38,  39,  40,  40,  40,  40,  40,  40,  40,
+        },
+    }, {
+        {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+             31,  31,  31,  31,  31,  31,  31,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
+        }, {
+             32,
+             31,  31,
+             31,  31,  31,
+             31,  31,  31,  31,
+             31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,
+             30,  30,  30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,
+             30,  30,  30,  30,  30,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  31,  32,  32,
+        },
+    },
+};
+
+const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+static uint8_t pb_32x32[32 * 32];
+static uint8_t qm_tbl_4x4[15][2][16];
+static uint8_t qm_tbl_4x8[15][2][32];
+static uint8_t qm_tbl_4x16[15][2][64];
+static uint8_t qm_tbl_8x8[15][2][64];
+static uint8_t qm_tbl_8x16[15][2][128];
+static uint8_t qm_tbl_8x32[15][2][256];
+static uint8_t qm_tbl_16x16[15][2][256];
+static uint8_t qm_tbl_16x32[15][2][512];
+static uint8_t qm_tbl_32x32[15][2][1024];
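+
+/* Note: the const *_t tables above pack each symmetric matrix as a lower
+ * triangle, and the rectangular tables are stored transposed; the buffers
+ * below receive the expanded and/or transposed copies built at runtime by
+ * dav1d_init_qm_tables(). The last array dimension is the w * h coefficient
+ * count for the size in each buffer's name. */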
+
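+/* Derive a sz x sz matrix by keeping every step'th row and column of a
+ * (sz * step)-wide source, e.g. sz = 16, step = 2 copies element (2y, 2x)
+ * of a 32x32 table into dst[y][x]. */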
+static void subsample(uint8_t *const dst, const uint8_t *const src,
+                      const int sz, const int step)
+{
+    for (int y = 0; y < sz; y++)
+        for (int x = 0; x < sz; x++)
+            dst[y * sz + x] = src[y * sz * step * step + x * step];
+}
+
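+/* Write the transpose of the w x h source into the h x w destination:
+ * dst[x][y] = src[y][x]. */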
+static void transpose(uint8_t *const dst, const uint8_t *const src,
+                      const int w, const int h)
+{
+    for (int y = 0, y_off = 0; y < h; y++, y_off += w)
+        for (int x = 0, x_off = 0; x < w; x++, x_off += h)
+            dst[x_off + y] = src[y_off + x];
+}
+
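+/* Expand a packed lower-triangular source (row y holds y + 1 entries) into
+ * a full sz x sz matrix; the matrices are symmetric, so the upper triangle
+ * is filled by mirroring across the diagonal. */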
+static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) {
+    for (int y = 0; y < sz; y++) {
+        memcpy(dst, src, y + 1);
+        const uint8_t *src_ptr = &src[y];
+        for (int x = y + 1; x < sz; x++) {
+            src_ptr += x;
+            dst[x] = *src_ptr;
+        }
+        dst += sz;
+        src += y + 1;
+    }
+}
+
+COLD void dav1d_init_qm_tables(void) {
+    // This function is guaranteed to be called only once
+
+    for (int i = 0; i < 15; i++)
+        for (int j = 0; j < 2; j++) {
+            // Note that the w/h in the assignments below are inverted; this
+            // is on purpose, because we store the coefficients transposed.
+            dav1d_qm_tbl[i][j][RTX_4X8  ] = qm_tbl_8x4[i][j];
+            dav1d_qm_tbl[i][j][RTX_8X4  ] = qm_tbl_4x8[i][j];
+            transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
+            dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j];
+            dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j];
+            transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
+            dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j];
+            dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j];
+            transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
+            dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j];
+            dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j];
+            transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
+            dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j];
+            dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j];
+            transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
+
+            dav1d_qm_tbl[i][j][ TX_4X4  ] = qm_tbl_4x4[i][j];
+            dav1d_qm_tbl[i][j][ TX_8X8  ] = qm_tbl_8x8[i][j];
+            dav1d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j];
+            dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j];
+            untriangle(qm_tbl_4x4[i][j], qm_tbl_4x4_t[i][j], 4);
+            untriangle(qm_tbl_8x8[i][j], qm_tbl_8x8_t[i][j], 8);
+            untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32);
+            subsample(qm_tbl_16x16[i][j], qm_tbl_32x32[i][j], 16, 2);
+
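+            // AV1 only defines quantizer matrices up to 32x32, and 64-pt
+            // transforms zero everything outside the top-left 32x32
+            // coefficients, so the 64-px sizes alias the 32-px tables.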
+            dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+            dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32];
+            dav1d_qm_tbl[i][j][RTX_64X16] = dav1d_qm_tbl[i][j][RTX_32X16];
+            dav1d_qm_tbl[i][j][RTX_32X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+            dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
+        }
+
+    memset(pb_32x32, 32, sizeof(pb_32x32));
+    for (int j = 0; j < 2; j++)
+        for (int k = 0; k < N_RECT_TX_SIZES; k++)
+            dav1d_qm_tbl[15][j][k] = pb_32x32;
+}
diff --git a/src/qm.h b/src/qm.h
new file mode 100644 (file)
index 0000000..23b2348
--- /dev/null
+++ b/src/qm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_QM_H
+#define DAV1D_SRC_QM_H
+
+#include "src/levels.h"
+
+extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+
+void dav1d_init_qm_tables(void);
+
+#endif /* DAV1D_SRC_QM_H */
diff --git a/src/recon.h b/src/recon.h
new file mode 100644 (file)
index 0000000..f84c8ab
--- /dev/null
+++ b/src/recon.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RECON_H
+#define DAV1D_SRC_RECON_H
+
+#include "src/internal.h"
+#include "src/levels.h"
+
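+/* Debug aids: change the leading 0 to 1 to enable per-block trace output,
+ * restricted to the frame/block range encoded in the condition below. */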
+#define DEBUG_BLOCK_INFO 0 && \
+        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
+        t->bx >= 8 && t->bx < 12
+#define DEBUG_B_PIXELS 0
+
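+/* The decl_*_fn macros declare each signature once, keeping the function
+ * pointer typedefs and the per-bitdepth (_8bpc/_16bpc) declarations below
+ * in sync. */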
+#define decl_recon_b_intra_fn(name) \
+void (name)(Dav1dTileContext *t, enum BlockSize bs, \
+            enum EdgeFlags intra_edge_flags, const Av1Block *b)
+typedef decl_recon_b_intra_fn(*recon_b_intra_fn);
+
+#define decl_recon_b_inter_fn(name) \
+int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_recon_b_inter_fn(*recon_b_inter_fn);
+
+#define decl_filter_sbrow_fn(name) \
+void (name)(Dav1dFrameContext *f, int sby)
+typedef decl_filter_sbrow_fn(*filter_sbrow_fn);
+
+#define decl_backup_ipred_edge_fn(name) \
+void (name)(Dav1dTileContext *t)
+typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
+
+#define decl_read_coef_blocks_fn(name) \
+void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
+
+decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
+decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);
+
+decl_recon_b_inter_fn(dav1d_recon_b_inter_8bpc);
+decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
+
+decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
+
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
+
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);
+
+#endif /* DAV1D_SRC_RECON_H */
diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c
new file mode 100644 (file)
index 0000000..8e96f8e
--- /dev/null
+++ b/src/recon_tmpl.c
@@ -0,0 +1,2063 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/intops.h"
+#include "common/mem.h"
+
+#include "src/cdef_apply.h"
+#include "src/ctx.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
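+/* Exp-Golomb decoder: a unary run of up to 32 zero bits gives the suffix
+ * length, and the result ((1 << len) | suffix) - 1 covers the range
+ * [2^len - 1, 2^(len+1) - 2]. The len < 32 cap guards against unbounded
+ * reads from corrupt bitstreams. */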
+static inline unsigned read_golomb(MsacContext *const msac) {
+    int len = 0;
+    unsigned val = 1;
+
+    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
+    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
+
+    return val - 1;
+}
+
+static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
+                                    const enum BlockSize bs,
+                                    const uint8_t *const a,
+                                    const uint8_t *const l,
+                                    const int chroma,
+                                    const enum Dav1dPixelLayout layout)
+{
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+
+    if (chroma) {
+        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
+                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
+        unsigned ca, cl;
+
+#define MERGE_CTX(dir, type, no_val) \
+        c##dir = *(const type *) dir != no_val; \
+        break
+
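+        /* 0x40 per 4-px unit is the value decode_coefs() stores in the
+         * coefficient contexts for a coded block with no non-zero
+         * coefficients (*res_ctx = 0x40 below), so ca/cl indicate whether
+         * the above/left neighbours carried any coefficients. */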
+        switch (t_dim->lw) {
+        /* For some reason the MSVC CRT _wassert() function is not flagged as
+         * __declspec(noreturn), so when using those headers the compiler will
+         * expect execution to continue after an assertion has been triggered
+         * and will therefore complain about the use of uninitialized variables
+         * when compiled in debug mode if we put the default case at the end. */
+        default: assert(0); /* fall-through */
+        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
+        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
+        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
+        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
+        }
+        switch (t_dim->lh) {
+        default: assert(0); /* fall-through */
+        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
+        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
+        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
+        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
+        }
+#undef MERGE_CTX
+
+        return 7 + not_one_blk * 3 + ca + cl;
+    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
+        return 0;
+    } else {
+        unsigned la, ll;
+
+#define MERGE_CTX(dir, type, tx) \
+        if (tx == TX_64X64) { \
+            uint64_t tmp = *(const uint64_t *) dir; \
+            tmp |= *(const uint64_t *) &dir[8]; \
+            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
+        } else \
+            l##dir = *(const type *) dir; \
+        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
+        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
+        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
+        break
+
+        switch (t_dim->lw) {
+        default: assert(0); /* fall-through */
+        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
+        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
+        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
+        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
+        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
+        }
+        switch (t_dim->lh) {
+        default: assert(0); /* fall-through */
+        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
+        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
+        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
+        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
+        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
+        }
+#undef MERGE_CTX
+
+        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
+    }
+}
+
+static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
+                                       const uint8_t *const a,
+                                       const uint8_t *const l)
+{
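+    /* Bits 6-7 of each 4-px unit in the above/left contexts hold the dc
+     * sign class written by decode_coefs(): 0 = negative, 1 = no dc,
+     * 2 = positive. s is the number of positive minus negative dc signs
+     * among the neighbours, mapped to context 0 (balanced), 1 (mostly
+     * negative) or 2 (mostly positive). */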
+    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
+    int s;
+
+#if ARCH_X86_64 && defined(__GNUC__)
+    /* Coerce compilers into producing better code. For some reason
+     * every x86-64 compiler is awful at handling 64-bit constants. */
+    __asm__("" : "+r"(mask), "+r"(mul));
+#endif
+
+    switch(tx) {
+    default: assert(0); /* fall-through */
+    case TX_4X4: {
+        int t = *(const uint8_t *) a >> 6;
+        t    += *(const uint8_t *) l >> 6;
+        s = t - 1 - 1;
+        break;
+    }
+    case TX_8X8: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 2 - 2;
+        break;
+    }
+    case TX_16X16: {
+        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
+        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
+        t *= (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 4;
+        break;
+    }
+    case TX_32X32: {
+        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
+        t         += (*(const uint64_t *) l & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 8 - 8;
+        break;
+    }
+    case TX_64X64: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 16;
+        break;
+    }
+    case RTX_4X8: {
+        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 1 - 2;
+        break;
+    }
+    case RTX_8X4: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint8_t  *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 2 - 1;
+        break;
+    }
+    case RTX_8X16: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 2 - 4;
+        break;
+    }
+    case RTX_16X8: {
+        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 2;
+        break;
+    }
+    case RTX_16X32: {
+        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) l & mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 4 - 8;
+        break;
+    }
+    case RTX_32X16: {
+        uint64_t t = *(const uint64_t *) a & mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 8 - 4;
+        break;
+    }
+    case RTX_32X64: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 8 - 16;
+        break;
+    }
+    case RTX_64X32: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 8;
+        break;
+    }
+    case RTX_4X16: {
+        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 1 - 4;
+        break;
+    }
+    case RTX_16X4: {
+        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint8_t  *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 1;
+        break;
+    }
+    case RTX_8X32: {
+        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) l & mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 2 - 8;
+        break;
+    }
+    case RTX_32X8: {
+        uint64_t t = *(const uint64_t *) a & mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 8 - 2;
+        break;
+    }
+    case RTX_16X64: {
+        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) &l[0] & mask;
+        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
+        t *= mul;
+        s = (int) (t >> 56) - 4 - 16;
+        break;
+    }
+    case RTX_64X16: {
+        uint64_t t = *(const uint64_t *) &a[0] & mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 4;
+        break;
+    }
+    }
+
+    return (s != 0) + (s > 0);
+}
+
+static inline unsigned get_lo_ctx(const uint8_t *const levels,
+                                  const enum TxClass tx_class,
+                                  unsigned *const hi_mag,
+                                  const uint8_t (*const ctx_offsets)[5],
+                                  const unsigned x, const unsigned y,
+                                  const ptrdiff_t stride)
+{
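+    /* mag sums the level bytes of already-decoded neighbouring coefficients:
+     * the right/below/diagonal neighbours for 2-D transforms, the next
+     * positions along the scan direction for 1-D ones. *hi_mag returns the
+     * partial sum that the caller reuses for the high-token context. */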
+    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
+    unsigned offset;
+    if (tx_class == TX_CLASS_2D) {
+        mag += levels[1 * stride + 1];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
+        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
+    } else {
+        mag += levels[0 * stride + 2];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
+        offset = 26 + (y > 1 ? 10 : y * 5);
+    }
+    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
+}
+
+static int decode_coefs(Dav1dTileContext *const t,
+                        uint8_t *const a, uint8_t *const l,
+                        const enum RectTxfmSize tx, const enum BlockSize bs,
+                        const Av1Block *const b, const int intra,
+                        const int plane, coef *cf,
+                        enum TxfmType *const txtp, uint8_t *res_ctx)
+{
+    Dav1dTileState *const ts = t->ts;
+    const int chroma = !!plane;
+    const Dav1dFrameContext *const f = t->f;
+    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
+
+    if (dbg)
+        printf("Start: r=%d\n", ts->msac.rng);
+
+    // does this block have any non-zero coefficients?
+    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
+    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
+                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
+    if (dbg)
+        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+               t_dim->ctx, sctx, all_skip, ts->msac.rng);
+    if (all_skip) {
+        *res_ctx = 0x40;
+        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
+        return -1;
+    }
+
+    // transform type (chroma: derived, luma: explicitly coded)
+    if (lossless) {
+        assert(t_dim->max == TX_4X4);
+        *txtp = WHT_WHT;
+    } else if (t_dim->max + intra >= TX_64X64) {
+        *txtp = DCT_DCT;
+    } else if (chroma) {
+        // inferred from either the luma txtp (inter) or a LUT (intra)
+        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
+                        get_uv_inter_txtp(t_dim, *txtp);
+    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
+        // In libaom, lossless is checked by a literal qidx == 0, but not all
+        // such blocks are actually lossless. The remainder gets an implicit
+        // transform type (for luma).
+        *txtp = DCT_DCT;
+    } else {
+        unsigned idx;
+        if (intra) {
+            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
+                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
+                idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
+                *txtp = dav1d_tx_types_per_set[idx + 0];
+            } else {
+                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
+                *txtp = dav1d_tx_types_per_set[idx + 5];
+            }
+            if (dbg)
+                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
+                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
+        } else {
+            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
+                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
+                          ts->cdf.m.txtp_inter3[t_dim->min]);
+                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
+            } else if (t_dim->min == TX_16X16) {
+                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                          ts->cdf.m.txtp_inter2, 11);
+                *txtp = dav1d_tx_types_per_set[idx + 12];
+            } else {
+                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
+                *txtp = dav1d_tx_types_per_set[idx + 24];
+            }
+            if (dbg)
+                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
+                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
+        }
+    }
+
+    // find end-of-block (eob)
+    int eob_bin;
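+    // tx2dszctx is log2(number of coefficients / 16), with 64-px dimensions
+    // counted as 32 since their upper coefficients are always zero; it
+    // selects among the eob_bin_16 .. eob_bin_1024 cdfs below.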
+    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+    const int is_1d = tx_class != TX_CLASS_2D;
+    switch (tx2dszctx) {
+#define case_sz(sz, bin, ns, is_1d) \
+    case sz: { \
+        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
+        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
+        break; \
+    }
+    case_sz(0,   16,  4, [is_1d]);
+    case_sz(1,   32,  8, [is_1d]);
+    case_sz(2,   64,  8, [is_1d]);
+    case_sz(3,  128,  8, [is_1d]);
+    case_sz(4,  256, 16, [is_1d]);
+    case_sz(5,  512, 16,        );
+    case_sz(6, 1024, 16,        );
+#undef case_sz
+    }
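+    // eob_bin is a magnitude class: values 0 and 1 map directly, and class
+    // k > 1 covers [2^(k-1), 2^k - 1], refined below by one adaptively
+    // coded high bit and k - 2 raw bits.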
+    if (dbg)
+        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+               16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+    int eob;
+    if (eob_bin > 1) {
+        uint16_t *const eob_hi_bit_cdf =
+            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+        if (dbg)
+            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+        eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
+              dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
+        if (dbg)
+            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+    } else {
+        eob = eob_bin;
+    }
+
+    // base tokens
+    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
+    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+    const uint16_t *const scan = dav1d_scans[tx][tx_class];
+    int dc_tok;
+
+    if (eob) {
+        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
+        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
+        const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
+        const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
+
+        /* eob */
+        unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
+        unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
+        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
+        int tok = eob_tok + 1;
+        int level_tok = tok * 0x41;
+        unsigned mag;
+        if (dbg)
+            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
+                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
+
+#define DECODE_COEFS_CLASS(tx_class) \
+        if (eob_tok == 2) { \
+            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
+                   tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
+            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+            level_tok = tok + (3 << 6); \
+            if (dbg) \
+                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
+                       ts->msac.rng); \
+        } \
+        cf[rc] = tok; \
+        if (tx_class == TX_CLASS_H) \
+            /* Transposing reduces the stride and padding requirements */ \
+            levels[y * stride + x] = (uint8_t) level_tok; \
+        else \
+            levels[x * stride + y] = (uint8_t) level_tok; \
+        for (int i = eob - 1; i > 0; i--) { /* ac */ \
+            if (tx_class == TX_CLASS_H) \
+                rc = i, x = rc & mask, y = rc >> shift; \
+            else \
+                rc = scan[i], x = rc >> shift, y = rc & mask; \
+            assert(x < 32 && y < 32); \
+            uint8_t *const level = levels + x * stride + y; \
+            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
+            if (tx_class == TX_CLASS_2D) \
+                y |= x; \
+            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+            level_tok = tok * 0x41; \
+            if (dbg) \
+                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                       t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
+            if (tok == 3) { \
+                mag &= 63; \
+                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
+                      (mag > 12 ? 6 : (mag + 1) >> 1); \
+                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+                level_tok = tok + (3 << 6); \
+                if (dbg) \
+                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                           imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
+                           ts->msac.rng); \
+            } \
+            cf[rc] = tok; \
+            *level = (uint8_t) level_tok; \
+        } \
+        /* dc */ \
+        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
+            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
+        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+        if (dbg) \
+            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
+                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
+        if (dc_tok == 3) { \
+            if (tx_class == TX_CLASS_2D) \
+                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
+                      levels[1 * stride + 1]; \
+            mag &= 63; \
+            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
+            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+            if (dbg) \
+                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
+                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
+        } \
+        break
+
+        switch (tx_class) {
+        case TX_CLASS_2D: {
+            const unsigned nonsquare_tx = tx >= RTX_4X8;
+            const uint8_t (*const lo_ctx_offsets)[5] =
+                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
+            const ptrdiff_t stride = 4 * sh;
+            memset(levels, 0, stride * (4 * sw + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_2D);
+        }
+        case TX_CLASS_H: {
+#define lo_ctx_offsets NULL
+            const ptrdiff_t stride = 16;
+            memset(levels, 0, stride * (4 * sh + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_H);
+        }
+        case TX_CLASS_V: {
+            const ptrdiff_t stride = 16;
+            memset(levels, 0, stride * (4 * sw + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_V);
+        }
+#undef lo_ctx_offsets
+#undef DECODE_COEFS_CLASS
+        default: assert(0);
+        }
+    } else { // dc-only
+        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
+        dc_tok = 1 + tok_br;
+        if (dbg)
+            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
+                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
+        if (tok_br == 2) {
+            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
+            if (dbg)
+                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
+                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
+        }
+    }
+
+    // residual and sign
+    int dc_sign = 1 << 6;
+    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+    const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
+    const int dq_shift = imax(0, t_dim->ctx - 2);
+    const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
+    const int cf_max = (1 << (7 + bitdepth)) - 1;
+    unsigned cul_level = 0;
+
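+    /* dequantisation below follows 7.12.3: the quantizer-matrix weight is
+     * applied with 32 as unity (hence the '+ 16 >> 5' rounding), and the
+     * product is masked to 24 bits before the transform-size shift */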
+    if (dc_tok) { // dc
+        const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
+        uint16_t *const dc_sign_cdf =
+            ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+        const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+        const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
+        if (dbg)
+            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+                   chroma, dc_sign_ctx, sign, ts->msac.rng);
+        dc_sign = (sign - 1) & (2 << 6);
+
+        if (dc_tok == 15) {
+            dc_tok += read_golomb(&ts->msac);
+            if (dbg)
+                printf("Post-dc_residual[%d->%d]: r=%d\n",
+                       dc_tok - 15, dc_tok, ts->msac.rng);
+
+            dc_tok &= 0xfffff;
+        }
+
+        cul_level += dc_tok;
+        dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift;
+        cf[0] = imin(dc_tok - sign, cf_max) ^ -sign;
+    }
+    for (int i = 1; i <= eob; i++) { // ac
+        const int rc = scan[i];
+        int tok = cf[rc];
+        if (!tok) continue;
+
+        // sign
+        const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
+        const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
+        if (dbg)
+            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
+
+        // residual
+        if (tok == 15) {
+            tok += read_golomb(&ts->msac);
+            if (dbg)
+                printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
+                       i, rc, tok - 15, tok, ts->msac.rng);
+
+            // coefficient parsing, see 5.11.39
+            tok &= 0xfffff;
+        }
+
+        // dequant, see 7.12.3
+        cul_level += tok;
+        tok = ((dq * tok) & 0xffffff) >> dq_shift;
+        cf[rc] = imin(tok - sign, cf_max) ^ -sign;
+    }
+
+    // context
+    *res_ctx = umin(cul_level, 63) | dc_sign;
+
+    return eob;
+}
+
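+/* Recursively walks the transform split tree of an inter block: split nodes
+ * descend into up to four sub-transforms, leaf nodes decode the coefficients
+ * and, on reconstruction passes (0 and 2), apply the inverse transform to
+ * dst. */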
+static void read_coef_tree(Dav1dTileContext *const t,
+                           const enum BlockSize bs, const Av1Block *const b,
+                           const enum RectTxfmSize ytx, const int depth,
+                           const uint16_t *const tx_split,
+                           const int x_off, const int y_off, pixel *dst)
+{
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+    const int txw = t_dim->w, txh = t_dim->h;
+
+    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
+     * be split. Avoids an undefined left shift. */
+    if (depth < 2 && tx_split[depth] &&
+        tx_split[depth] & (1 << (y_off * 4 + x_off)))
+    {
+        const enum RectTxfmSize sub = t_dim->sub;
+        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                       x_off * 2 + 0, y_off * 2 + 0, dst);
+        t->bx += txsw;
+        if (txw >= txh && t->bx < f->bw)
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+        t->bx -= txsw;
+        t->by += txsh;
+        if (txh >= txw && t->by < f->bh) {
+            if (dst)
+                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                           x_off * 2 + 0, y_off * 2 + 1, dst);
+            t->bx += txsw;
+            if (txw >= txh && t->bx < f->bw)
+                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+            t->bx -= txsw;
+        }
+        t->by -= txsh;
+    } else {
+        const int bx4 = t->bx & 31, by4 = t->by & 31;
+        enum TxfmType txtp;
+        uint8_t cf_ctx;
+        int eob;
+        coef *cf;
+        struct CodedBlockInfo *cbi;
+
+        if (f->frame_thread.pass) {
+            assert(ts->frame_thread.cf);
+            cf = ts->frame_thread.cf;
+            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+        } else {
+            cf = bitfn(t->cf);
+        }
+        if (f->frame_thread.pass != 2) {
+            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                       ytx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+            memset(&t->dir lcoef[off], cf_ctx, sz)
+            case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+            case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            for (int y = 0; y < txh; y++) { \
+                rep_macro(type, txtp_map, 0, mul * txtp); \
+                txtp_map += 32; \
+            }
+            uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+            case_set_upto16(txw,,,);
+#undef set_ctx
+            if (f->frame_thread.pass == 1) {
+                cbi->eob[0] = eob;
+                cbi->txtp[0] = txtp;
+            }
+        } else {
+            eob = cbi->eob[0];
+            txtp = cbi->txtp[0];
+        }
+        if (!(f->frame_thread.pass & 1)) {
+            assert(dst);
+            if (eob >= 0) {
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
+                                              HIGHBD_CALL_SUFFIX);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+            }
+        }
+    }
+}
+
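+/* Frame-threading pass 1 entry point: parses all coefficients of one block
+ * into per-tile frame-thread storage and records eob/txtp in CodedBlockInfo,
+ * so that pass 2 can reconstruct without touching the entropy decoder. */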
+void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
+                                    const enum BlockSize bs, const Av1Block *const b)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+
+    if (b->skip) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+        }
+        return;
+    }
+
+    Dav1dTileState *const ts = t->ts;
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    assert(f->frame_thread.pass == 1);
+    assert(!b->skip);
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        const int sub_h4 = imin(h4, 16 + init_y);
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            const int sub_w4 = imin(w4, init_x + 16);
+            int y_off = !!init_y, y, x;
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h, y_off++)
+            {
+                struct CodedBlockInfo *const cbi =
+                    &f->frame_thread.cbi[t->by * f->b4_stride];
+                int x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w, x_off++)
+                {
+                    if (!b->intra) {
+                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+                                       x_off, y_off, NULL);
+                    } else {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        const int eob = cbi[t->bx].eob[0] =
+                            decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                   b->tx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[0] = txtp;
+                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir lcoef[off], cf_ctx, sz)
+                        case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+                                                     l., 1, by4 + y);
+                        case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+                                                     a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+                    }
+                }
+                t->bx -= x;
+            }
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    struct CodedBlockInfo *const cbi =
+                        &f->frame_thread.cbi[t->by * f->b4_stride];
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        if (!b->intra)
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                        const int eob = cbi[t->bx].eob[1 + pl] =
+                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
+                                         &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                   "txtp=%d,eob=%d]: r=%d\n",
+                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[1 + pl] = txtp;
+                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                 l., 1, cby4 + y);
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                 a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
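+/* Motion compensation for a single plane of one block. Exactly one of dst8
+ * (pixel output) and dst16 (16-bit intermediate for compound prediction)
+ * must be set. Uses emu_edge() when the prediction area crosses the
+ * reference frame edge, and the scaled-MC path when the reference and
+ * current frame differ in resolution. Returns nonzero if waiting on the
+ * reference picture failed. */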
+static int mc(Dav1dTileContext *const t,
+              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
+              const int bw4, const int bh4,
+              const int bx, const int by, const int pl,
+              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
+              const enum Filter2d filter_2d)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    const int mvx = mv.x, mvy = mv.y;
+    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
+    ptrdiff_t ref_stride = refp->p.stride[!!pl];
+    const pixel *ref;
+
+    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
+        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+        int w, h;
+
+        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
+            if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
+                                          PLANE_TYPE_Y + !!pl))
+            {
+                return -1;
+            }
+            w = (f->cur.p.w + ss_hor) >> ss_hor;
+            h = (f->cur.p.h + ss_ver) >> ss_ver;
+        } else {
+            w = f->bw * 4 >> ss_hor;
+            h = f->bh * 4 >> ss_ver;
+        }
+        if (dx < !!mx * 3 || dy < !!my * 3 ||
+            dx + bw4 * h_mul + !!mx * 4 > w ||
+            dy + bh4 * v_mul + !!my * 4 > h)
+        {
+            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
+                                w, h, dx - !!mx * 3, dy - !!my * 3,
+                                emu_edge_buf, 192 * sizeof(pixel),
+                                refp->p.data[pl], ref_stride);
+            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
+            ref_stride = 192 * sizeof(pixel);
+        } else {
+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+        }
+
+        if (dst8 != NULL) {
+            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
+                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+                                     HIGHBD_CALL_SUFFIX);
+        } else {
+            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
+                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+                                      HIGHBD_CALL_SUFFIX);
+        }
+    } else {
+        assert(refp != &f->sr_cur);
+
+        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
+        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
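+        /* the scale factor has 14 fractional bits and orig_pos has 4, so
+         * after the '>> 8' the scaled position is left with 10 fractional
+         * bits, matching the '>> 10' and '& 0x3ff' uses below */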
+#define scale_mv(res, val, scale) do { \
+            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
+            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
+        } while (0)
+        int pos_y, pos_x;
+        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
+        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
+#undef scale_mv
+        const int left = pos_x >> 10;
+        const int top = pos_y >> 10;
+        const int right =
+            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
+        const int bottom =
+            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
+
+        if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl))
+            return -1;
+        if (DEBUG_BLOCK_INFO)
+            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
+                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
+                   right-left, bottom-top,
+                   f->svc[refidx][0].step, f->svc[refidx][1].step);
+
+        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
+        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
+        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
+            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
+                                w, h, left - 3, top - 3,
+                                emu_edge_buf, 320 * sizeof(pixel),
+                                refp->p.data[pl], ref_stride);
+            ref = &emu_edge_buf[320 * 3 + 3];
+            ref_stride = 320 * sizeof(pixel);
+            if (DEBUG_BLOCK_INFO) printf("Emu\n");
+        } else {
+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
+        }
+
+        if (dst8 != NULL) {
+            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
+                                            bw4 * h_mul, bh4 * v_mul,
+                                            pos_x & 0x3ff, pos_y & 0x3ff,
+                                            f->svc[refidx][0].step,
+                                            f->svc[refidx][1].step
+                                            HIGHBD_CALL_SUFFIX);
+        } else {
+            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
+                                             bw4 * h_mul, bh4 * v_mul,
+                                             pos_x & 0x3ff, pos_y & 0x3ff,
+                                             f->svc[refidx][0].step,
+                                             f->svc[refidx][1].step
+                                             HIGHBD_CALL_SUFFIX);
+        }
+    }
+
+    return 0;
+}
+
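+/* Overlapped block motion compensation: re-predicts narrow strips along the
+ * top and left edges of the block from up to four neighbouring inter blocks'
+ * MVs each, blending the results into dst via blend_h()/blend_v(). */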
+static int obmc(Dav1dTileContext *const t,
+                pixel *const dst, const ptrdiff_t dst_stride,
+                const uint8_t *const b_dim, const int pl,
+                const int bx4, const int by4, const int w4, const int h4)
+{
+    assert(!(t->bx & 1) && !(t->by & 1));
+    const Dav1dFrameContext *const f = t->f;
+    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+    pixel *const lap = bitfn(t->scratch.lap);
+    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    int res;
+
+    if (t->by > t->ts->tiling.row_start &&
+        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+    {
+        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+
+            if (a_r->ref.ref[0] > 0) {
+                const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
+                const int oh4 = imin(b_dim[1], 16) >> 1;
+                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
+                         t->bx + x, t->by, pl, a_r->mv.mv[0],
+                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
+                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+                                   h_mul * ow4, v_mul * oh4);
+                i++;
+            }
+            x += imax(a_b_dim[0], 2);
+        }
+    }
+
+    if (t->bx > t->ts->tiling.col_start)
+        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+
+            if (l_r->ref.ref[0] > 0) {
+                const int ow4 = imin(b_dim[0], 16) >> 1;
+                const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
+                         t->bx, t->by + y, pl, l_r->mv.mv[0],
+                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
+                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
+                i++;
+            }
+            y += imax(l_b_dim[1], 2);
+        }
+    return 0;
+}
+
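+/* Warped motion prediction in 8x8 output blocks: the affine matrix is
+ * evaluated at each block's centre to get an integer base position (dx/dy)
+ * and fractional offsets (mx/my), which warp8x8()/warp8x8t() then step
+ * per-sample using the alpha/beta/gamma/delta deltas. */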
+static int warp_affine(Dav1dTileContext *const t,
+                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
+                       const uint8_t *const b_dim, const int pl,
+                       const Dav1dThreadPicture *const refp,
+                       const Dav1dWarpedMotionParams *const wmp)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+    const int32_t *const mat = wmp->matrix;
+    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
+    const int height = (refp->p.p.h + ss_ver) >> ss_ver;
+
+    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
+        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+            // calculate transformation relative to center of 8x8 block in
+            // luma pixel units
+            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
+            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+
+            const int dx = (int) (mvx >> 16) - 4;
+            const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
+                                                   wmp->beta  * 7) & ~0x3f;
+            const int dy = (int) (mvy >> 16) - 4;
+            const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
+                                                   wmp->delta * 4) & ~0x3f;
+
+            const pixel *ref_ptr;
+            ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+            if (dav1d_thread_picture_wait(refp, dy + 4 + 8,
+                                          PLANE_TYPE_Y + !!pl))
+            {
+                return -1;
+            }
+            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
+                                    emu_edge_buf, 32 * sizeof(pixel),
+                                    refp->p.data[pl], ref_stride);
+                ref_ptr = &emu_edge_buf[32 * 3 + 3];
+                ref_stride = 32 * sizeof(pixel);
+            } else {
+                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+            }
+            if (dst16 != NULL)
+                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+                                 wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+            else
+                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+                                wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+        }
+        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
+        else      dst16 += 8 * dstride;
+    }
+    return 0;
+}
+
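+/* Reconstructs one intra block: intra prediction (directional, palette or
+ * CFL), followed by coefficient decoding and the inverse transform unless
+ * the block is skipped, iterating luma and chroma in 64x64 pixel (16 units
+ * of 4x4) sub-blocks. */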
+void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                 const enum EdgeFlags intra_edge_flags,
+                                 const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+    // coefficient coding
+    pixel *const edge = bitfn(t->scratch.edge) + 128;
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
+    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
+
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        const int sub_h4 = imin(h4, 16 + init_y);
+        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            if (b->pal_sz[0]) {
+                pixel *dst = ((pixel *) f->cur.data[0]) +
+                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    assert(ts->frame_thread.pal_idx);
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+                } else {
+                    pal_idx = t->scratch.pal_idx;
+                }
+                const uint16_t *const pal = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
+                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
+                                       pal_idx, bw4 * 4, bh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
+                             bw4 * 4, bh4 * 4, "y-pal-pred");
+            }
+
+            const int intra_flags = (sm_flag(t->a, bx4) |
+                                     sm_flag(&t->l, by4) |
+                                     intra_edge_filter_flag);
+            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+            int y, x;
+            const int sub_w4 = imin(w4, init_x + 16);
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h)
+            {
+                pixel *dst = ((pixel *) f->cur.data[0]) +
+                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
+                                    t->bx + init_x);
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w)
+                {
+                    if (b->pal_sz[0]) goto skip_y_pred;
+
+                    int angle = b->y_angle;
+                    const enum EdgeFlags edge_flags =
+                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+                             0 : EDGE_I444_TOP_HAS_RIGHT) |
+                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[0];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(t->bx,
+                                                          t->bx > ts->tiling.col_start,
+                                                          t->by,
+                                                          t->by > ts->tiling.row_start,
+                                                          ts->tiling.col_end,
+                                                          ts->tiling.row_end,
+                                                          edge_flags, dst,
+                                                          f->cur.stride[0], top_sb_edge,
+                                                          b->y_mode, &angle,
+                                                          t_dim->w, t_dim->h,
+                                                          f->seq_hdr->intra_edge_filter,
+                                                          edge HIGHBD_CALL_SUFFIX);
+                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
+                                             t_dim->w * 4, t_dim->h * 4,
+                                             angle | intra_flags,
+                                             4 * f->bw - 4 * t->bx,
+                                             4 * f->bh - 4 * t->by
+                                             HIGHBD_CALL_SUFFIX);
+
+                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+                                 t_dim->h * 4, 2, "l");
+                        hex_dump(edge, 0, 1, 1, "tl");
+                        hex_dump(edge + 1, t_dim->w * 4,
+                                 t_dim->w * 4, 2, "t");
+                        hex_dump(dst, f->cur.stride[0],
+                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+                    }
+
+                skip_y_pred: {}
+                    if (!b->skip) {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[0];
+                            txtp = cbi->txtp[0];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = bitfn(t->cf);
+                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                               &t->l.lcoef[by4 + y], b->tx, bs,
+                                               b, 1, 0, cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                       b->tx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir lcoef[off], cf_ctx, sz)
+                            case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+                                                         l., 1, by4 + y);
+                            case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+                                                         a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, imin(t_dim->h, 8) * 4,
+                                          imin(t_dim->w, 8) * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->tx]
+                                              [txtp](dst,
+                                                     f->cur.stride[0],
+                                                     cf, eob HIGHBD_CALL_SUFFIX);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(dst, f->cur.stride[0],
+                                         t_dim->w * 4, t_dim->h * 4, "recon");
+                        }
+                    } else if (!f->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+                        case_set_upto16(t_dim->h, l., 1, by4 + y);
+                        case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
+                    }
+                    dst += 4 * t_dim->w;
+                }
+                t->bx -= x;
+            }
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const ptrdiff_t stride = f->cur.stride[1];
+
+            if (b->uv_mode == CFL_PRED) {
+                assert(!init_x && !init_y);
+
+                int16_t *const ac = t->scratch.ac;
+                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
+                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
+                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+                                              (t->by >> ss_ver) * PXSTRIDE(stride));
+                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
+                                           ((pixel *) f->cur.data[2]) + uv_off };
+
+                const int furthest_r =
+                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+                const int furthest_b =
+                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
+                                                         cbw4 - (furthest_r >> ss_hor),
+                                                         cbh4 - (furthest_b >> ss_ver),
+                                                         cbw4 * 4, cbh4 * 4);
+                for (int pl = 0; pl < 2; pl++) {
+                    if (!b->cfl_alpha[pl]) continue;
+                    int angle = 0;
+                    const pixel *top_sb_edge = NULL;
+                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                    const int xstart = ts->tiling.col_start >> ss_hor;
+                    const int ystart = ts->tiling.row_start >> ss_ver;
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                          ypos, ypos > ystart,
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uv_dst[pl], stride,
+                                                          top_sb_edge, DC_PRED, &angle,
+                                                          uv_t_dim->w, uv_t_dim->h, 0,
+                                                          edge HIGHBD_CALL_SUFFIX);
+                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+                                           uv_t_dim->w * 4,
+                                           uv_t_dim->h * 4,
+                                           ac, b->cfl_alpha[pl]
+                                           HIGHBD_CALL_SUFFIX);
+                }
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+                }
+            } else if (b->pal_sz[1]) {
+                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+                const uint16_t (*pal)[8];
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    assert(ts->frame_thread.pal_idx);
+                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                              ((t->bx >> 1) + (t->by & 1))];
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+                } else {
+                    pal = t->scratch.pal;
+                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+                }
+
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
+                                       f->cur.stride[1], pal[1],
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
+                                       f->cur.stride[1], pal[2],
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
+                             PXSTRIDE(f->cur.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
+                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
+                             PXSTRIDE(f->cur.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
+                }
+            }
+
+            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+                                 sm_uv_flag(&t->l, cby4);
+            const int uv_sb_has_tr =
+                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
+            const int uv_sb_has_bl =
+                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
+                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+                                        ((t->bx + init_x) >> ss_hor));
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+                            b->pal_sz[1])
+                        {
+                            goto skip_uv_pred;
+                        }
+
+                        int angle = b->uv_angle;
+                        // this probably looks weird because we're using
+                        // luma flags in a chroma loop, but that's because
+                        // prepare_intra_edges() expects luma flags as input
+                        const enum EdgeFlags edge_flags =
+                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+                              (x + uv_t_dim->w >= sub_cw4)) ?
+                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
+                            ((x > (init_x >> ss_hor) ||
+                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                        const pixel *top_sb_edge = NULL;
+                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                            top_sb_edge = f->ipred_edge[1 + pl];
+                            const int sby = t->by >> f->sb_shift;
+                            top_sb_edge += f->sb128w * 128 * (sby - 1);
+                        }
+                        const enum IntraPredMode uv_mode =
+                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                        const int xstart = ts->tiling.col_start >> ss_hor;
+                        const int ystart = ts->tiling.row_start >> ss_ver;
+                        const enum IntraPredMode m =
+                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                              ypos, ypos > ystart,
+                                                              ts->tiling.col_end >> ss_hor,
+                                                              ts->tiling.row_end >> ss_ver,
+                                                              edge_flags, dst, stride,
+                                                              top_sb_edge, uv_mode,
+                                                              &angle, uv_t_dim->w,
+                                                              uv_t_dim->h,
+                                                              f->seq_hdr->intra_edge_filter,
+                                                              edge HIGHBD_CALL_SUFFIX);
+                        angle |= intra_edge_filter_flag;
+                        dsp->ipred.intra_pred[m](dst, stride, edge,
+                                                 uv_t_dim->w * 4,
+                                                 uv_t_dim->h * 4,
+                                                 angle | sm_uv_fl,
+                                                 (4 * f->bw + ss_hor -
+                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
+                                                 (4 * f->bh + ss_ver -
+                                                  4 * (t->by & ~ss_ver)) >> ss_ver
+                                                 HIGHBD_CALL_SUFFIX);
+                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+                                     uv_t_dim->h * 4, 2, "l");
+                            hex_dump(edge, 0, 1, 1, "tl");
+                            hex_dump(edge + 1, uv_t_dim->w * 4,
+                                     uv_t_dim->w * 4, 2, "t");
+                            hex_dump(dst, stride, uv_t_dim->w * 4,
+                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+                        }
+
+                    skip_uv_pred: {}
+                        if (!b->skip) {
+                            enum TxfmType txtp;
+                            int eob;
+                            coef *cf;
+                            if (f->frame_thread.pass) {
+                                cf = ts->frame_thread.cf;
+                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                                const struct CodedBlockInfo *const cbi =
+                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                                eob = cbi->eob[pl + 1];
+                                txtp = cbi->txtp[pl + 1];
+                            } else {
+                                uint8_t cf_ctx;
+                                cf = bitfn(t->cf);
+                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                                   &t->l.ccoef[pl][cby4 + y],
+                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
+                                                   &txtp, &cf_ctx);
+                                if (DEBUG_BLOCK_INFO)
+                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                                rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                                memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                         l., 1, cby4 + y);
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                         a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+                            }
+                            if (eob >= 0) {
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    coef_dump(cf, uv_t_dim->h * 4,
+                                              uv_t_dim->w * 4, 3, "dq");
+                                dsp->itx.itxfm_add[b->uvtx]
+                                                  [txtp](dst, stride,
+                                                         cf, eob HIGHBD_CALL_SUFFIX);
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    hex_dump(dst, stride, uv_t_dim->w * 4,
+                                             uv_t_dim->h * 4, "recon");
+                            }
+                        } else if (!f->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+                            case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+                            case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
+                        }
+                        dst += uv_t_dim->w * 4;
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
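+/* Reconstructs one inter block: intrabc or motion-compensated prediction
+ * (including OBMC, warped motion and inter-intra blending), followed by
+ * residual reconstruction. Returns nonzero if waiting on a reference
+ * picture failed. */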
+int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
+    int res;
+
+    // prediction
+    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+    pixel *dst = ((pixel *) f->cur.data[0]) +
+        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+    const ptrdiff_t uvdstoff =
+        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+    if (!(f->frame_hdr->frame_type & 1)) {
+        // intrabc
+        assert(!f->frame_hdr->super_res.enabled);
+        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
+                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+        if (res) return res;
+        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
+            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
+                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
+                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+            if (res) return res;
+        }
+    } else if (b->comp_type == COMP_INTER_NONE) {
+        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+        const enum Filter2d filter_2d = b->filter2d;
+
+        if (imin(bw4, bh4) > 1 &&
+            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+        {
+            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
+                              b->motion_mode == MM_WARP ? &t->warpmv :
+                                  &f->frame_hdr->gmv[b->ref[0]]);
+            if (res) return res;
+        } else {
+            res = mc(t, dst, NULL, f->cur.stride[0],
+                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
+            if (res) return res;
+            if (b->motion_mode == MM_OBMC) {
+                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
+                if (res) return res;
+            }
+        }
+        if (b->interintra_type) {
+            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+                                   SMOOTH_PRED : b->interintra_mode;
+            pixel *const tmp = bitfn(t->scratch.interintra);
+            int angle = 0;
+            const pixel *top_sb_edge = NULL;
+            if (!(t->by & (f->sb_step - 1))) {
+                top_sb_edge = f->ipred_edge[0];
+                const int sby = t->by >> f->sb_shift;
+                top_sb_edge += f->sb128w * 128 * (sby - 1);
+            }
+            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+                                                  t->by, t->by > ts->tiling.row_start,
+                                                  ts->tiling.col_end, ts->tiling.row_end,
+                                                  0, dst, f->cur.stride[0], top_sb_edge,
+                                                  m, &angle, bw4, bh4, 0, tl_edge
+                                                  HIGHBD_CALL_SUFFIX);
+            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
+                                     HIGHBD_CALL_SUFFIX);
+            const uint8_t *const ii_mask =
+                b->interintra_type == INTER_INTRA_BLEND ?
+                     dav1d_ii_masks[bs][0][b->interintra_mode] :
+                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.blend(dst, f->cur.stride[0], tmp,
+                          bw4 * 4, bh4 * 4, ii_mask);
+        }
+
+        if (!has_chroma) goto skip_inter_chroma_pred;
+
+        // sub8x8 derivation
+        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+        refmvs_block *const *r;
+        if (is_sub8x8) {
+            assert(ss_hor == 1);
+            r = &t->rt.r[(t->by & 31) + 5];
+            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+            if (bw4 == 1 && bh4 == ss_ver)
+                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+        }
+
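+        /* with subsampled chroma, a sub8x8 luma block shares its chroma
+         * block with up to three left/top neighbours; their MVs are applied
+         * at the matching sub-offsets below so the shared chroma area is
+         * fully predicted */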
+        // chroma prediction
+        if (is_sub8x8) {
+            assert(ss_hor == 1);
+            ptrdiff_t h_off = 0, v_off = 0;
+            if (bw4 == 1 && bh4 == ss_ver) {
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+                             NULL, f->cur.stride[1],
+                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+                             r[-1][t->bx - 1].mv.mv[0],
+                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
+                             r[-1][t->bx - 1].ref.ref[0] - 1,
+                             f->frame_thread.pass != 2 ? t->tl_4x4_filter :
+                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+                    if (res) return res;
+                }
+                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+                h_off = 2;
+            }
+            if (bw4 == 1) {
+                const enum Filter2d left_filter_2d =
+                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
+                             f->cur.stride[1], bw4, bh4, t->bx - 1,
+                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
+                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
+                             r[0][t->bx - 1].ref.ref[0] - 1,
+                             f->frame_thread.pass != 2 ? left_filter_2d :
+                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+                    if (res) return res;
+                }
+                h_off = 2;
+            }
+            if (bh4 == ss_ver) {
+                const enum Filter2d top_filter_2d =
+                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
+                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
+                             1 + pl, r[-1][t->bx].mv.mv[0],
+                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
+                             r[-1][t->bx].ref.ref[0] - 1,
+                             f->frame_thread.pass != 2 ? top_filter_2d :
+                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+                    if (res) return res;
+                }
+                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+            }
+            for (int pl = 0; pl < 2; pl++) {
+                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
+                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
+                         refp, b->ref[0], filter_2d);
+                if (res) return res;
+            }
+        } else {
+            if (imin(cbw4, cbh4) > 1 &&
+                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+            {
+                for (int pl = 0; pl < 2; pl++) {
+                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
+                                      f->cur.stride[1], b_dim, 1 + pl, refp,
+                                      b->motion_mode == MM_WARP ? &t->warpmv :
+                                          &f->frame_hdr->gmv[b->ref[0]]);
+                    if (res) return res;
+                }
+            } else {
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+                             NULL, f->cur.stride[1],
+                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                             t->bx & ~ss_hor, t->by & ~ss_ver,
+                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
+                    if (res) return res;
+                    if (b->motion_mode == MM_OBMC) {
+                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+                        if (res) return res;
+                    }
+                }
+            }
+            if (b->interintra_type) {
+                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+                // the wrong thing since it will select 4x16, not 4x32, as a
+                // transform size...
+                const uint8_t *const ii_mask =
+                    b->interintra_type == INTER_INTRA_BLEND ?
+                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
+                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
+
+                for (int pl = 0; pl < 2; pl++) {
+                    pixel *const tmp = bitfn(t->scratch.interintra);
+                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+                    enum IntraPredMode m =
+                        b->interintra_mode == II_SMOOTH_PRED ?
+                        SMOOTH_PRED : b->interintra_mode;
+                    int angle = 0;
+                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+                                                          (t->bx >> ss_hor) >
+                                                              (ts->tiling.col_start >> ss_hor),
+                                                          t->by >> ss_ver,
+                                                          (t->by >> ss_ver) >
+                                                              (ts->tiling.row_start >> ss_ver),
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uvdst, f->cur.stride[1],
+                                                          top_sb_edge, m,
+                                                          &angle, cbw4, cbh4, 0, tl_edge
+                                                          HIGHBD_CALL_SUFFIX);
+                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
+                                             HIGHBD_CALL_SUFFIX);
+                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
+                                  cbw4 * 4, cbh4 * 4, ii_mask);
+                }
+            }
+        }
+
+    skip_inter_chroma_pred: {}
+        t->tl_4x4_filter = filter_2d;
+    } else {
+        const enum Filter2d filter_2d = b->filter2d;
+        // Maximum super block size is 128x128
+        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
+        int jnt_weight;
+        uint8_t *const seg_mask = t->scratch.seg_mask;
+        const uint8_t *mask;
+
+        for (int i = 0; i < 2; i++) {
+            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+                                  &f->frame_hdr->gmv[b->ref[i]]);
+                if (res) return res;
+            } else {
+                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+                         b->mv[i], refp, b->ref[i], filter_2d);
+                if (res) return res;
+            }
+        }
+        switch (b->comp_type) {
+        case COMP_INTER_AVG:
+            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
+            break;
+        case COMP_INTER_WEIGHTED_AVG:
+            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
+            break;
+        case COMP_INTER_SEG:
+            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
+                                           tmp[b->mask_sign], tmp[!b->mask_sign],
+                                           bw4 * 4, bh4 * 4, seg_mask,
+                                           b->mask_sign HIGHBD_CALL_SUFFIX);
+            mask = seg_mask;
+            break;
+        case COMP_INTER_WEDGE:
+            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.mask(dst, f->cur.stride[0],
+                         tmp[b->mask_sign], tmp[!b->mask_sign],
+                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
+            if (has_chroma)
+                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
+            break;
+        }
+
+        // chroma
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            for (int i = 0; i < 2; i++) {
+                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+                if (b->inter_mode == GLOBALMV_GLOBALMV &&
+                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+                {
+                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
+                                      b_dim, 1 + pl,
+                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
+                    if (res) return res;
+                } else {
+                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
+                    if (res) return res;
+                }
+            }
+            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+            switch (b->comp_type) {
+            case COMP_INTER_AVG:
+                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
+                            HIGHBD_CALL_SUFFIX);
+                break;
+            case COMP_INTER_WEIGHTED_AVG:
+                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
+                              HIGHBD_CALL_SUFFIX);
+                break;
+            case COMP_INTER_WEDGE:
+            case COMP_INTER_SEG:
+                dsp->mc.mask(uvdst, f->cur.stride[1],
+                             tmp[b->mask_sign], tmp[!b->mask_sign],
+                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
+                             HIGHBD_CALL_SUFFIX);
+                break;
+            }
+        }
+    }
+
+    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+        if (has_chroma) {
+            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
+                     cbw4 * 4, cbh4 * 4, "u-pred");
+            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
+                     cbw4 * 4, cbh4 * 4, "v-pred");
+        }
+    }
+
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+    if (b->skip) {
+        // reset coef contexts
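+        // (0x40 is the sentinel value the coef context arrays use for blocks
+        // with no coded coefficients)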
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+        }
+        return 0;
+    }
+
+    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
+    for (int init_y = 0; init_y < bh4; init_y += 16) {
+        for (int init_x = 0; init_x < bw4; init_x += 16) {
+            // coefficient coding & inverse transforms
+            int y_off = !!init_y, y;
+            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
+            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+                 y += ytx->h, y_off++)
+            {
+                int x, x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+                     x += ytx->w, x_off++)
+                {
+                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+                                   x_off, y_off, &dst[x * 4]);
+                    t->bx += ytx->w;
+                }
+                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
+                t->bx -= x;
+                t->by += ytx->h;
+            }
+            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
+            t->by -= y;
+
+            // chroma coefs and inverse transform
+            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
+                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
+                for (y = init_y >> ss_ver, t->by += init_y;
+                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+                {
+                    int x;
+                    for (x = init_x >> ss_hor, t->bx += init_x;
+                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+                    {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[1 + pl];
+                            txtp = cbi->txtp[1 + pl];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = bitfn(t->cf);
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                               &t->l.ccoef[pl][cby4 + y],
+                                               b->uvtx, bs, b, 0, 1 + pl,
+                                               cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                       "txtp=%d,eob=%d]: r=%d\n",
+                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                     l., 1, cby4 + y);
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                     a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->uvtx]
+                                              [txtp](&uvdst[4 * x],
+                                                     f->cur.stride[1],
+                                                     cf, eob HIGHBD_CALL_SUFFIX);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
+                                         uvtx->w * 4, uvtx->h * 4, "recon");
+                        }
+                        t->bx += uvtx->w << ss_hor;
+                    }
+                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
+                    t->bx -= x << ss_hor;
+                    t->by += uvtx->h << ss_ver;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+    return 0;
+}
+
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+    const int sbsz = f->sb_step, sbh = f->sbh;
+
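+    // Post-reconstruction filters run in spec order over this superblock row:
+    // deblocking, a copy of the deblocked pixels that loop restoration needs,
+    // CDEF, super-resolution upscaling (when coded and output widths differ),
+    // and finally loop restoration on the upscaled planes.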
+    if (f->frame_hdr->loopfilter.level_y[0] ||
+        f->frame_hdr->loopfilter.level_y[1])
+    {
+        int start_of_tile_row = 0;
+        if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
+            start_of_tile_row = f->lf.tile_row++;
+        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
+                                       start_of_tile_row);
+    }
+
+    if (f->lf.restore_planes) {
+        // Store loop filtered pixels required by loop restoration
+        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
+    }
+    if (f->seq_hdr->cdef) {
+        if (sby) {
+            const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            pixel *p_up[3] = {
+                f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
+                f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+                f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+            };
+            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
+                                    sby * sbsz - 2, sby * sbsz);
+        }
+        const int n_blks = sbsz - 2 * (sby + 1 < sbh);
+        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
+                                imin(sby * sbsz + n_blks, f->bh));
+    }
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+        const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+        for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
+            const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            const int h_start = 8 * !!sby >> ss_ver;
+            const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
+            pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
+            const ptrdiff_t src_stride = f->cur.stride[!!pl];
+            const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
+            const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
+            const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+            const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
+            const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
+
+            f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
+                              imin(img_h, h_end) + h_start, src_w,
+                              f->resize_step[!!pl], f->resize_start[!!pl]
+                              HIGHBD_CALL_SUFFIX);
+        }
+    }
+    if (f->lf.restore_planes) {
+        bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
+    }
+
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
+    f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+    f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+    f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
+    f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
+    f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
+    f->lf.prev_mask_ptr = f->lf.mask_ptr;
+    if ((sby & 1) || f->seq_hdr->sb128) {
+        f->lf.mask_ptr += f->sb128w;
+    }
+}
+
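+// Saves the bottom row of pixels of this superblock row; the next superblock
+// row reads it back as its top intra-prediction edge (top_sb_edge /
+// f->ipred_edge[] in the prediction code above).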
+void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const int sby = t->by >> f->sb_shift;
+    const int sby_off = f->sb128w * 128 * sby;
+    const int x_off = ts->tiling.col_start;
+
+    const pixel *const y =
+        ((const pixel *) f->cur.data[0]) + x_off * 4 +
+                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
+    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
+               4 * (ts->tiling.col_end - x_off));
+
+    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+
+        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
+            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
+        for (int pl = 1; pl <= 2; pl++)
+            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
+                       &((const pixel *) f->cur.data[pl])[uv_off],
+                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
+    }
+}
diff --git a/src/ref.c b/src/ref.c
new file mode 100644
index 0000000..32cc96f
--- /dev/null
+++ b/src/ref.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/mem.h"
+
+#include "src/ref.h"
+
+static void default_free_callback(const uint8_t *const data, void *const user_data) {
+    assert(data == user_data);
+    dav1d_free_aligned(user_data);
+}
+
+Dav1dRef *dav1d_ref_create(const size_t size) {
+    void *data = dav1d_alloc_aligned(size, 32);
+    if (!data) return NULL;
+
+    Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data);
+    if (res)
+        res->data = data;
+    else
+        dav1d_free_aligned(data);
+
+    return res;
+}
+
+Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
+                         void (*free_callback)(const uint8_t *data, void *user_data),
+                         void *const user_data)
+{
+    Dav1dRef *res = malloc(sizeof(Dav1dRef));
+    if (!res) return NULL;
+
+    res->data = NULL;
+    res->const_data = ptr;
+    atomic_init(&res->ref_cnt, 1);
+    res->free_callback = free_callback;
+    res->user_data = user_data;
+
+    return res;
+}
+
+void dav1d_ref_inc(Dav1dRef *const ref) {
+    atomic_fetch_add(&ref->ref_cnt, 1);
+}
+
+void dav1d_ref_dec(Dav1dRef **const pref) {
+    assert(pref != NULL);
+
+    Dav1dRef *const ref = *pref;
+    if (!ref) return;
+
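+    // atomic_fetch_sub() returns the pre-decrement value, so a result of 1
+    // means this was the last outstanding reference.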
+    if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
+        ref->free_callback(ref->const_data, ref->user_data);
+        free(ref);
+    }
+    *pref = NULL;
+}
+
+int dav1d_ref_is_writable(Dav1dRef *const ref) {
+    return atomic_load(&ref->ref_cnt) == 1 && ref->data;
+}
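+
+/* Minimal usage sketch (illustrative only, not part of the file):
+ *
+ *     Dav1dRef *ref = dav1d_ref_create(4096); // ref_cnt == 1, owns buffer
+ *     dav1d_ref_inc(ref);                     // ref_cnt == 2, shared
+ *     dav1d_ref_dec(&ref);                    // ref_cnt == 1, ref = NULL
+ *     // The remaining holder later calls dav1d_ref_dec() on its own pointer,
+ *     // dropping ref_cnt to 0, which frees the buffer and the Dav1dRef.
+ */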
diff --git a/src/ref.h b/src/ref.h
new file mode 100644
index 0000000..b26c01a
--- /dev/null
+++ b/src/ref.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_H
+#define DAV1D_SRC_REF_H
+
+#include "dav1d/dav1d.h"
+
+#include <stdatomic.h>
+#include <stddef.h>
+
+struct Dav1dRef {
+    void *data;
+    const void *const_data;
+    atomic_int ref_cnt;
+    void (*free_callback)(const uint8_t *data, void *user_data);
+    void *user_data;
+};
+
+Dav1dRef *dav1d_ref_create(size_t size);
+Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
+                         void (*free_callback)(const uint8_t *data, void *user_data),
+                         void *user_data);
+void dav1d_ref_inc(Dav1dRef *ref);
+void dav1d_ref_dec(Dav1dRef **ref);
+
+int dav1d_ref_is_writable(Dav1dRef *ref);
+
+#endif /* DAV1D_SRC_REF_H */
diff --git a/src/refmvs.c b/src/refmvs.c
new file mode 100644
index 0000000..1e113b4
--- /dev/null
+++ b/src/refmvs.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "dav1d/common.h"
+
+#include "common/intops.h"
+
+#include "src/env.h"
+#include "src/refmvs.h"
+
+static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,
+                                  const int weight, const refmvs_block *const b,
+                                  const union refmvs_refpair ref, const mv gmv[2],
+                                  int *const have_newmv_match,
+                                  int *const have_refmv_match)
+{
+    if (b->mv.mv[0].n == INVALID_MV) return; // intra block, no intrabc
+
+    if (ref.ref[1] == -1) {
+        for (int n = 0; n < 2; n++) {
+            if (b->ref.ref[n] == ref.ref[0]) {
+                const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
+                                   gmv[0] : b->mv.mv[n];
+
+                const int last = *cnt;
+                for (int m = 0; m < last; m++)
+                    if (mvstack[m].mv.mv[0].n == cand_mv.n) {
+                        mvstack[m].weight += weight;
+                        *have_refmv_match = 1;
+                        *have_newmv_match |= b->mf >> 1;
+                        return;
+                    }
+
+                if (last < 8) {
+                    mvstack[last].mv.mv[0] = cand_mv;
+                    mvstack[last].weight = weight;
+                    *cnt = last + 1;
+                }
+                *have_refmv_match = 1;
+                *have_newmv_match |= b->mf >> 1;
+                return;
+            }
+        }
+    } else if (b->ref.pair == ref.pair) {
+        const refmvs_mvpair cand_mv = { .mv = {
+            [0] = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[0],
+            [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
+        }};
+
+        const int last = *cnt;
+        for (int n = 0; n < last; n++)
+            if (mvstack[n].mv.n == cand_mv.n) {
+                mvstack[n].weight += weight;
+                *have_refmv_match = 1;
+                *have_newmv_match |= b->mf >> 1;
+                return;
+            }
+
+        if (last < 8) {
+            mvstack[last].mv = cand_mv;
+            mvstack[last].weight = weight;
+            *cnt = last + 1;
+        }
+        *have_refmv_match = 1;
+        *have_newmv_match |= b->mf >> 1;
+    }
+}
+
+static int scan_row(refmvs_candidate *const mvstack, int *const cnt,
+                    const union refmvs_refpair ref, const mv gmv[2],
+                    const refmvs_block *b, const int bw4, const int w4,
+                    const int max_rows, const int step,
+                    int *const have_newmv_match, int *const have_refmv_match)
+{
+    const refmvs_block *cand_b = b;
+    const enum BlockSize first_cand_bs = cand_b->bs;
+    const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+    int cand_bw4 = first_cand_b_dim[0];
+    int len = imax(step, imin(bw4, cand_bw4));
+
+    if (bw4 <= cand_bw4) {
+        // FIXME weight can be higher for odd blocks (bx4 & 1), but then the
+        // position of the first block has to be odd already, i.e. not just
+        // for row_offset=-3/-5
+        // FIXME why can this not be cand_bw4?
+        const int weight = bw4 == 1 ? 2 :
+                           imax(2, imin(2 * max_rows, first_cand_b_dim[1]));
+        add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        return weight >> 1;
+    }
+
+    for (int x = 0;;) {
+        // FIXME if we overhang above, we could fill a bitmask so we don't have
+        // to repeat the add_spatial_candidate() for the next row, but just increase
+        // the weight here
+        add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        x += len;
+        if (x >= w4) return 1;
+        cand_b = &b[x];
+        cand_bw4 = dav1d_block_dimensions[cand_b->bs][0];
+        assert(cand_bw4 < bw4);
+        len = imax(step, cand_bw4);
+    }
+}
+
+static int scan_col(refmvs_candidate *const mvstack, int *const cnt,
+                    const union refmvs_refpair ref, const mv gmv[2],
+                    /*const*/ refmvs_block *const *b, const int bh4, const int h4,
+                    const int bx4, const int max_cols, const int step,
+                    int *const have_newmv_match, int *const have_refmv_match)
+{
+    const refmvs_block *cand_b = &b[0][bx4];
+    const enum BlockSize first_cand_bs = cand_b->bs;
+    const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+    int cand_bh4 = first_cand_b_dim[1];
+    int len = imax(step, imin(bh4, cand_bh4));
+
+    if (bh4 <= cand_bh4) {
+        // FIXME weight can be higher for odd blocks (by4 & 1), but then the
+        // position of the first block has to be odd already, i.e. not just
+        // for col_offset=-3/-5
+        // FIXME why can this not be cand_bh4?
+        const int weight = bh4 == 1 ? 2 :
+                           imax(2, imin(2 * max_cols, first_cand_b_dim[0]));
+        add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        return weight >> 1;
+    }
+
+    for (int y = 0;;) {
+        // FIXME if we overhang to the left, we could fill a bitmask so we
+        // don't have to repeat the add_spatial_candidate() for the next
+        // column, but just increase the weight here
+        add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        y += len;
+        if (y >= h4) return 1;
+        cand_b = &b[y][bx4];
+        cand_bh4 = dav1d_block_dimensions[cand_b->bs][1];
+        assert(cand_bh4 < bh4);
+        len = imax(step, cand_bh4);
+    }
+}
+
+static inline union mv mv_projection(const union mv mv, const int num, const int den) {
+    static const uint16_t div_mult[32] = {
+           0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
+        2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092,
+        1024,   963,  910,  862,  819,  780,  744,  712,
+         682,   655,  630,  606,  585,  564,  546,  528
+    };
+    assert(den > 0 && den < 32);
+    assert(num > -32 && num < 32);
+    const int frac = num * div_mult[den];
+    const int y = mv.y * frac, x = mv.x * frac;
+    // Round and clip according to AV1 spec section 7.9.3
+    return (union mv) { // 0x3fff == (1 << 14) - 1
+        .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+        .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+    };
+}
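+
+/* Worked example (illustrative): div_mult[den] == (1 << 14) / den, so
+ * frac = num * div_mult[den] approximates num / den in Q14, undone by the
+ * final >> 14. For num = 2, den = 4, mv.x = 1000 (1/8-pel units):
+ *
+ *     frac = 2 * 4096 = 8192
+ *     x = 1000 * 8192 = 8192000
+ *     (8192000 + 8192 + 0) >> 14 = 500 == 1000 * 2 / 4
+ *
+ * The (x >> 31) term subtracts 1 from negative products so rounding stays
+ * symmetric around zero: mv.x = -1000 projects to -500, not -499.
+ */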
+
+static void add_temporal_candidate(const refmvs_frame *const rf,
+                                   refmvs_candidate *const mvstack, int *const cnt,
+                                   const refmvs_temporal_block *const rb,
+                                   const union refmvs_refpair ref, int *const globalmv_ctx,
+                                   const union mv gmv[])
+{
+    if (rb->mv.n == INVALID_MV) return;
+
+    union mv mv = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref);
+    fix_mv_precision(rf->frm_hdr, &mv);
+
+    const int last = *cnt;
+    if (ref.ref[1] == -1) {
+        if (globalmv_ctx)
+            *globalmv_ctx = (abs(mv.x - gmv[0].x) | abs(mv.y - gmv[0].y)) >= 16;
+
+        for (int n = 0; n < last; n++)
+            if (mvstack[n].mv.mv[0].n == mv.n) {
+                mvstack[n].weight += 2;
+                return;
+            }
+        if (last < 8) {
+            mvstack[last].mv.mv[0] = mv;
+            mvstack[last].weight = 2;
+            *cnt = last + 1;
+        }
+    } else {
+        refmvs_mvpair mvp = { .mv = {
+            [0] = mv,
+            [1] = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref),
+        }};
+        fix_mv_precision(rf->frm_hdr, &mvp.mv[1]);
+
+        for (int n = 0; n < last; n++)
+            if (mvstack[n].mv.n == mvp.n) {
+                mvstack[n].weight += 2;
+                return;
+            }
+        if (last < 8) {
+            mvstack[last].mv = mvp;
+            mvstack[last].weight = 2;
+            *cnt = last + 1;
+        }
+    }
+}
+
+static void add_compound_extended_candidate(refmvs_candidate *const same,
+                                            int *const same_count,
+                                            const refmvs_block *const cand_b,
+                                            const int sign0, const int sign1,
+                                            const union refmvs_refpair ref,
+                                            const uint8_t *const sign_bias)
+{
+    refmvs_candidate *const diff = &same[2];
+    int *const diff_count = &same_count[2];
+
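+    // "same" collects candidate MVs whose reference directly matches
+    // ref.ref[0]/ref.ref[1]; "diff" (aliased to same[2..], counts in
+    // same_count[2..]) collects MVs from other references, sign-flipped
+    // whenever the temporal sign bias disagrees.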
+    for (int n = 0; n < 2; n++) {
+        const int cand_ref = cand_b->ref.ref[n];
+
+        if (cand_ref <= 0) break;
+
+        mv cand_mv = cand_b->mv.mv[n];
+        if (cand_ref == ref.ref[0]) {
+            if (same_count[0] < 2)
+                same[same_count[0]++].mv.mv[0] = cand_mv;
+            if (diff_count[1] < 2) {
+                if (sign1 ^ sign_bias[cand_ref - 1]) {
+                    cand_mv.y = -cand_mv.y;
+                    cand_mv.x = -cand_mv.x;
+                }
+                diff[diff_count[1]++].mv.mv[1] = cand_mv;
+            }
+        } else if (cand_ref == ref.ref[1]) {
+            if (same_count[1] < 2)
+                same[same_count[1]++].mv.mv[1] = cand_mv;
+            if (diff_count[0] < 2) {
+                if (sign0 ^ sign_bias[cand_ref - 1]) {
+                    cand_mv.y = -cand_mv.y;
+                    cand_mv.x = -cand_mv.x;
+                }
+                diff[diff_count[0]++].mv.mv[0] = cand_mv;
+            }
+        } else {
+            mv i_cand_mv = (union mv) {
+                .x = -cand_mv.x,
+                .y = -cand_mv.y
+            };
+
+            if (diff_count[0] < 2) {
+                diff[diff_count[0]++].mv.mv[0] =
+                    sign0 ^ sign_bias[cand_ref - 1] ?
+                    i_cand_mv : cand_mv;
+            }
+
+            if (diff_count[1] < 2) {
+                diff[diff_count[1]++].mv.mv[1] =
+                    sign1 ^ sign_bias[cand_ref - 1] ?
+                    i_cand_mv : cand_mv;
+            }
+        }
+    }
+}
+
+static void add_single_extended_candidate(refmvs_candidate mvstack[8], int *const cnt,
+                                          const refmvs_block *const cand_b,
+                                          const int sign, const uint8_t *const sign_bias)
+{
+    for (int n = 0; n < 2; n++) {
+        const int cand_ref = cand_b->ref.ref[n];
+
+        if (cand_ref <= 0) break;
+        // we need to continue even if cand_ref == ref.ref[0], since
+        // the candidate could have been added as a globalmv variant,
+        // which changes the value
+        // FIXME if scan_{row,col}() returned a mask for the nearest
+        // edge, we could skip the appropriate ones here
+
+        mv cand_mv = cand_b->mv.mv[n];
+        if (sign ^ sign_bias[cand_ref - 1]) {
+            cand_mv.y = -cand_mv.y;
+            cand_mv.x = -cand_mv.x;
+        }
+
+        int m;
+        const int last = *cnt;
+        for (m = 0; m < last; m++)
+            if (cand_mv.n == mvstack[m].mv.mv[0].n)
+                break;
+        if (m == last) {
+            mvstack[m].mv.mv[0] = cand_mv;
+            mvstack[m].weight = 2; // "minimal"
+            *cnt = last + 1;
+        }
+    }
+}
+
+/*
+ * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame
+ * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing.
+ * refmvs_tile.r[] keeps a list of 35 (32 + 3 above) pointers into this memory,
+ * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top
+ * (-5/-3/-1) pointers by calling dav1d_refmvs_tile_sbrow_init() at the start
+ * of each tile/sbrow.
+ *
+ * For temporal MV referencing, we call dav1d_refmvs_save_tmvs() at the end of
+ * each tile/sbrow (when tile column threading is enabled), or at the start of
+ * each interleaved sbrow (i.e. once for all tile columns together, when tile
+ * column threading is disabled). This will copy the 4x4-resolution spatial MVs
+ * into 8x8-resolution refmvs_temporal_block structures. Then, for subsequent
+ * frames, at the start of each tile/sbrow (when tile column threading is
+ * enabled) or at the start of each interleaved sbrow (when tile column
+ * threading is disabled), we call load_tmvs(), which will project the MVs to
+ * their respective position in the current frame.
+ */
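+
+/* Rough per-sbrow call order implied by the above (illustrative; the exact
+ * interleaving depends on the threading configuration):
+ *
+ *     dav1d_refmvs_load_tmvs(...);       // project prior frames' MVs in
+ *     dav1d_refmvs_tile_sbrow_init(...); // rotate the sbrow row pointers
+ *     // per block: dav1d_refmvs_find(...) builds the candidate stack
+ *     dav1d_refmvs_save_tmvs(...);       // store this row's MVs for later
+ */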
+
+void dav1d_refmvs_find(const refmvs_tile *const rt,
+                       refmvs_candidate mvstack[8], int *const cnt,
+                       int *const ctx,
+                       const union refmvs_refpair ref, const enum BlockSize bs,
+                       const enum EdgeFlags edge_flags,
+                       const int by4, const int bx4)
+{
+    const refmvs_frame *const rf = rt->rf;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4);
+    const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4);
+    mv gmv[2], tgmv[2];
+
+    *cnt = 0;
+    assert(ref.ref[0] >=  0 && ref.ref[0] <= 8 &&
+           ref.ref[1] >= -1 && ref.ref[1] <= 8);
+    if (ref.ref[0] > 0) {
+        tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1],
+                             bx4, by4, bw4, bh4, rf->frm_hdr);
+        gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+                 tgmv[0] : (mv) { .n = INVALID_MV };
+    } else {
+        tgmv[0] = (mv) { .n = 0 };
+        gmv[0] = (mv) { .n = INVALID_MV };
+    }
+    if (ref.ref[1] > 0) {
+        tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1],
+                             bx4, by4, bw4, bh4, rf->frm_hdr);
+        gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+                 tgmv[1] : (mv) { .n = INVALID_MV };
+    }
+
+    // top
+    int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0;
+    unsigned max_rows = 0, n_rows = ~0U;
+    const refmvs_block *b_top;
+    if (by4 > rt->tile_row.start) {
+        max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1));
+        b_top = &rt->r[(by4 & 31) - 1 + 5][bx4];
+        n_rows = scan_row(mvstack, cnt, ref, gmv, b_top,
+                          bw4, w4, max_rows, bw4 >= 16 ? 4 : 1,
+                          &have_newmv, &have_row_mvs);
+    }
+
+    // left
+    unsigned max_cols = 0, n_cols = ~0U;
+    refmvs_block *const *b_left;
+    if (bx4 > rt->tile_col.start) {
+        max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1));
+        b_left = &rt->r[(by4 & 31) + 5];
+        n_cols = scan_col(mvstack, cnt, ref, gmv, b_left,
+                          bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1,
+                          &have_newmv, &have_col_mvs);
+    }
+
+    // top/right
+    if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT &&
+        imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end)
+    {
+        add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv,
+                              &have_newmv, &have_row_mvs);
+    }
+
+    const int nearest_match = have_col_mvs + have_row_mvs;
+    const int nearest_cnt = *cnt;
+    for (int n = 0; n < nearest_cnt; n++)
+        mvstack[n].weight += 640;
+
+    // temporal
+    int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs;
+    if (rf->use_ref_frame_mvs) {
+        const ptrdiff_t stride = rf->rp_stride;
+        const int by8 = by4 >> 1, bx8 = bx4 >> 1;
+        const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8];
+        const refmvs_temporal_block *rb = rbi;
+        const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1;
+        const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8);
+        for (int y = 0; y < h8; y += step_v) {
+            for (int x = 0; x < w8; x += step_h) {
+                add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref,
+                                       !(x | y) ? &globalmv_ctx : NULL, tgmv);
+            }
+            rb += stride * step_v;
+        }
+        if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) {
+            const int bh8 = bh4 >> 1, bw8 = bw4 >> 1;
+            rb = &rbi[bh8 * stride];
+            const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1,
+                                                    (by8 & ~7) + 8);
+            if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) {
+                add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref,
+                                       NULL, NULL);
+            }
+            if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) {
+                if (has_bottom) {
+                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref,
+                                           NULL, NULL);
+                }
+                if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) {
+                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride],
+                                           ref, NULL, NULL);
+                }
+            }
+        }
+    }
+    assert(*cnt <= 8);
+
+    // top/left (which, confusingly, is part of "secondary" references)
+    int have_dummy_newmv_match;
+    if ((n_rows | n_cols) != ~0U) {
+        add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv,
+                              &have_dummy_newmv_match, &have_row_mvs);
+    }
+
+    // "secondary" (non-direct neighbour) top & left edges
+    // what is different about secondary is that everything is now in 8x8 resolution
+    for (int n = 2; n <= 3; n++) {
+        if ((unsigned) n > n_rows && (unsigned) n <= max_rows) {
+            n_rows += scan_row(mvstack, cnt, ref, gmv,
+                               &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1],
+                               bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2,
+                               &have_dummy_newmv_match, &have_row_mvs);
+        }
+
+        if ((unsigned) n > n_cols && (unsigned) n <= max_cols) {
+            n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5],
+                               bh4, h4, (bx4 - n * 2 + 1) | 1,
+                               1 + max_cols - n, bh4 >= 16 ? 4 : 2,
+                               &have_dummy_newmv_match, &have_col_mvs);
+        }
+    }
+    assert(*cnt <= 8);
+
+    const int ref_match_count = have_col_mvs + have_row_mvs;
+
+    // context build-up
+    int refmv_ctx, newmv_ctx;
+    switch (nearest_match) {
+    case 0:
+        refmv_ctx = imin(2, ref_match_count);
+        newmv_ctx = ref_match_count > 0;
+        break;
+    case 1:
+        refmv_ctx = imin(ref_match_count * 3, 4);
+        newmv_ctx = 3 - have_newmv;
+        break;
+    case 2:
+        refmv_ctx = 5;
+        newmv_ctx = 5 - have_newmv;
+        break;
+    }
+
+    // sorting (nearest, then "secondary")
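+    // (two in-place bubble sorts, descending by weight, done separately so
+    // the nearest candidates always stay ahead of the secondary ones)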
+    int len = nearest_cnt;
+    while (len) {
+        int last = 0;
+        for (int n = 1; n < len; n++) {
+            if (mvstack[n - 1].weight < mvstack[n].weight) {
+#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
+                EXCHANGE(mvstack[n - 1], mvstack[n]);
+                last = n;
+            }
+        }
+        len = last;
+    }
+    len = *cnt;
+    while (len > nearest_cnt) {
+        int last = nearest_cnt;
+        for (int n = nearest_cnt + 1; n < len; n++) {
+            if (mvstack[n - 1].weight < mvstack[n].weight) {
+                EXCHANGE(mvstack[n - 1], mvstack[n]);
+#undef EXCHANGE
+                last = n;
+            }
+        }
+        len = last;
+    }
+
+    if (ref.ref[1] > 0) {
+        if (*cnt < 2) {
+            const int sign0 = rf->sign_bias[ref.ref[0] - 1];
+            const int sign1 = rf->sign_bias[ref.ref[1] - 1];
+            const int sz4 = imin(w4, h4);
+            refmvs_candidate *const same = &mvstack[*cnt];
+            int same_count[4] = { 0 };
+
+            // non-self references in top
+            if (n_rows != ~0U) for (int x = 0; x < sz4;) {
+                const refmvs_block *const cand_b = &b_top[x];
+                add_compound_extended_candidate(same, same_count, cand_b,
+                                                sign0, sign1, ref, rf->sign_bias);
+                x += dav1d_block_dimensions[cand_b->bs][0];
+            }
+
+            // non-self references in left
+            if (n_cols != ~0U) for (int y = 0; y < sz4;) {
+                const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+                add_compound_extended_candidate(same, same_count, cand_b,
+                                                sign0, sign1, ref, rf->sign_bias);
+                y += dav1d_block_dimensions[cand_b->bs][1];
+            }
+
+            refmvs_candidate *const diff = &same[2];
+            const int *const diff_count = &same_count[2];
+
+            // merge together
+            for (int n = 0; n < 2; n++) {
+                int m = same_count[n];
+
+                if (m >= 2) continue;
+
+                const int l = diff_count[n];
+                if (l) {
+                    same[m].mv.mv[n] = diff[0].mv.mv[n];
+                    if (++m == 2) continue;
+                    if (l == 2) {
+                        same[1].mv.mv[n] = diff[1].mv.mv[n];
+                        continue;
+                    }
+                }
+                do {
+                    same[m].mv.mv[n] = tgmv[n];
+                } while (++m < 2);
+            }
+
+            // if the first extended was the same as the non-extended one,
+            // then replace it with the second extended one
+            int n = *cnt;
+            if (n == 1 && mvstack[0].mv.n == same[0].mv.n)
+                mvstack[1].mv = mvstack[2].mv;
+            do {
+                mvstack[n].weight = 2;
+            } while (++n < 2);
+            *cnt = 2;
+        }
+
+        // clamping
+        const int left = -(bx4 + bw4 + 4) * 4 * 8;
+        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+        const int top = -(by4 + bh4 + 4) * 4 * 8;
+        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+        const int n_refmvs = *cnt;
+        int n = 0;
+        do {
+            mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+            mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+            mvstack[n].mv.mv[1].x = iclip(mvstack[n].mv.mv[1].x, left, right);
+            mvstack[n].mv.mv[1].y = iclip(mvstack[n].mv.mv[1].y, top, bottom);
+        } while (++n < n_refmvs);
+
+        switch (refmv_ctx >> 1) {
+        case 0:
+            *ctx = imin(newmv_ctx, 1);
+            break;
+        case 1:
+            *ctx = 1 + imin(newmv_ctx, 3);
+            break;
+        case 2:
+            *ctx = iclip(3 + newmv_ctx, 4, 7);
+            break;
+        }
+
+        return;
+    } else if (*cnt < 2 && ref.ref[0] > 0) {
+        const int sign = rf->sign_bias[ref.ref[0] - 1];
+        const int sz4 = imin(w4, h4);
+
+        // non-self references in top
+        if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) {
+            const refmvs_block *const cand_b = &b_top[x];
+            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+            x += dav1d_block_dimensions[cand_b->bs][0];
+        }
+
+        // non-self references in left
+        if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) {
+            const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+            y += dav1d_block_dimensions[cand_b->bs][1];
+        }
+    }
+    assert(*cnt <= 8);
+
+    // clamping
+    int n_refmvs = *cnt;
+    if (n_refmvs) {
+        const int left = -(bx4 + bw4 + 4) * 4 * 8;
+        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+        const int top = -(by4 + bh4 + 4) * 4 * 8;
+        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+        int n = 0;
+        do {
+            mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+            mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+        } while (++n < n_refmvs);
+    }
+
+    for (int n = *cnt; n < 2; n++)
+        mvstack[n].mv.mv[0] = tgmv[0];
+
+    *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx;
+}
+
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
+                                  const int tile_col_start4, const int tile_col_end4,
+                                  const int tile_row_start4, const int tile_row_end4,
+                                  const int sby, int tile_row_idx)
+{
+    if (rf->n_tile_threads == 1) tile_row_idx = 0;
+    rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
+    refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx];
+    const int sbsz = rf->sbsz;
+    const int off = (sbsz * sby) & 16;
+    for (int i = 0; i < sbsz; i++, r += rf->r_stride)
+        rt->r[off + 5 + i] = r;
+    rt->r[off + 0] = r;
+    r += rf->r_stride;
+    rt->r[off + 1] = NULL;
+    rt->r[off + 2] = r;
+    r += rf->r_stride;
+    rt->r[off + 3] = NULL;
+    rt->r[off + 4] = r;
+    if (sby & 1) {
+#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
+        EXCHANGE(rt->r[off + 0], rt->r[off + sbsz + 0]);
+        EXCHANGE(rt->r[off + 2], rt->r[off + sbsz + 2]);
+        EXCHANGE(rt->r[off + 4], rt->r[off + sbsz + 4]);
+#undef EXCHANGE
+    }
+
+    rt->rf = rf;
+    rt->tile_row.start = tile_row_start4;
+    rt->tile_row.end = imin(tile_row_end4, rf->ih4);
+    rt->tile_col.start = tile_col_start4;
+    rt->tile_col.end = imin(tile_col_end4, rf->iw4);
+}
+
+void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx,
+                            const int col_start8, const int col_end8,
+                            const int row_start8, int row_end8)
+{
+    if (rf->n_tile_threads == 1) tile_row_idx = 0;
+    assert(row_start8 >= 0);
+    assert((unsigned) (row_end8 - row_start8) <= 16U);
+    row_end8 = imin(row_end8, rf->ih8);
+    const int col_start8i = imax(col_start8 - 8, 0);
+    const int col_end8i = imin(col_end8 + 8, rf->iw8);
+
+    const ptrdiff_t stride = rf->rp_stride;
+    refmvs_temporal_block *rp_proj =
+        &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride];
+    for (int y = row_start8; y < row_end8; y++) {
+        for (int x = col_start8; x < col_end8; x++)
+            rp_proj[x].mv.n = INVALID_MV;
+        rp_proj += stride;
+    }
+
+    rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
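+    // For each enabled motion-field reference, walk its saved 8x8 MV grid,
+    // project every valid MV onto the current frame, and store the hits that
+    // land inside the allowed superblock-aligned window; runs of identical
+    // neighboring blocks are stepped through in a single pass.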
+    for (int n = 0; n < rf->n_mfmvs; n++) {
+        const int ref2cur = rf->mfmv_ref2cur[n];
+        if (ref2cur == INT_MIN) continue;
+
+        const int ref = rf->mfmv_ref[n];
+        const int ref_sign = ref - 4;
+        const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride];
+        for (int y = row_start8; y < row_end8; y++) {
+            const int y_sb_align = y & ~7;
+            const int y_proj_start = imax(y_sb_align, row_start8);
+            const int y_proj_end = imin(y_sb_align + 8, row_end8);
+            for (int x = col_start8i; x < col_end8i; x++) {
+                const refmvs_temporal_block *rb = &r[x];
+                const int b_ref = rb->ref;
+                if (!b_ref) continue;
+                const int ref2ref = rf->mfmv_ref2ref[n][b_ref - 1];
+                if (!ref2ref) continue;
+                const mv b_mv = rb->mv;
+                const mv offset = mv_projection(b_mv, ref2cur, ref2ref);
+                int pos_x = x + apply_sign(abs(offset.x) >> 6,
+                                           offset.x ^ ref_sign);
+                const int pos_y = y + apply_sign(abs(offset.y) >> 6,
+                                                 offset.y ^ ref_sign);
+                if (pos_y >= y_proj_start && pos_y < y_proj_end) {
+                    const ptrdiff_t pos = (pos_y & 15) * stride;
+                    for (;;) {
+                        const int x_sb_align = x & ~7;
+                        if (pos_x >= imax(x_sb_align - 8, col_start8) &&
+                            pos_x < imin(x_sb_align + 16, col_end8))
+                        {
+                            rp_proj[pos + pos_x].mv = rb->mv;
+                            rp_proj[pos + pos_x].ref = ref2ref;
+                        }
+                        if (++x >= col_end8i) break;
+                        rb++;
+                        if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+                        pos_x++;
+                    }
+                } else {
+                    for (;;) {
+                        if (++x >= col_end8i) break;
+                        rb++;
+                        if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+                    }
+                }
+                x--;
+            }
+            r += stride;
+        }
+    }
+}
+
+void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt,
+                            const int col_start8, int col_end8,
+                            const int row_start8, int row_end8)
+{
+    const refmvs_frame *const rf = rt->rf;
+
+    assert(row_start8 >= 0);
+    assert((unsigned) (row_end8 - row_start8) <= 16U);
+    row_end8 = imin(row_end8, rf->ih8);
+    col_end8 = imin(col_end8, rf->iw8);
+
+    const ptrdiff_t stride = rf->rp_stride;
+    const uint8_t *const ref_sign = rf->mfmv_sign;
+    refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
+    for (int y = row_start8; y < row_end8; y++) {
+        const refmvs_block *const b = rt->r[6 + (y & 15) * 2];
+
+        for (int x = col_start8; x < col_end8;) {
+            const refmvs_block *const cand_b = &b[x * 2 + 1];
+            const int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1;
+
+            if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] &&
+                (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096)
+            {
+                for (int n = 0; n < bw8; n++, x++)
+                    rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[1],
+                                                      .ref = cand_b->ref.ref[1] };
+            } else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] &&
+                       (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096)
+            {
+                for (int n = 0; n < bw8; n++, x++)
+                    rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0],
+                                                      .ref = cand_b->ref.ref[0] };
+            } else {
+                for (int n = 0; n < bw8; n++, x++)
+                    rp[x].ref = 0; // "invalid"
+            }
+        }
+        rp += stride;
+    }
+}
+
+int dav1d_refmvs_init_frame(refmvs_frame *const rf,
+                            const Dav1dSequenceHeader *const seq_hdr,
+                            const Dav1dFrameHeader *const frm_hdr,
+                            const unsigned ref_poc[7],
+                            refmvs_temporal_block *const rp,
+                            const unsigned ref_ref_poc[7][7],
+                            /*const*/ refmvs_temporal_block *const rp_ref[7],
+                            const int n_tile_threads)
+{
+    rf->sbsz = 16 << seq_hdr->sb128;
+    rf->frm_hdr = frm_hdr;
+    rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
+    rf->ih8 = (frm_hdr->height + 7) >> 3;
+    rf->iw4 = rf->iw8 << 1;
+    rf->ih4 = rf->ih8 << 1;
+
+    const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
+    const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
+    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
+        if (rf->r) free(rf->r);
+        rf->r = malloc(sizeof(*rf->r) * 35 * r_stride * n_tile_rows);
+        if (!rf->r) return DAV1D_ERR(ENOMEM);
+        rf->r_stride = r_stride;
+    }
+
+    const ptrdiff_t rp_stride = r_stride >> 1;
+    if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
+        if (rf->rp_proj) free(rf->rp_proj);
+        rf->rp_proj = malloc(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows);
+        if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
+        rf->rp_stride = rp_stride;
+    }
+    rf->n_tile_rows = n_tile_rows;
+    rf->n_tile_threads = n_tile_threads;
+    rf->rp = rp;
+    rf->rp_ref = rp_ref;
+    const unsigned poc = frm_hdr->frame_offset;
+    for (int i = 0; i < 7; i++) {
+        const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                          ref_poc[i], poc);
+        rf->sign_bias[i] = poc_diff > 0;
+        rf->mfmv_sign[i] = poc_diff < 0;
+        rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits,
+                                            poc, ref_poc[i]), -31, 31);
+    }
+
+    // temporal MV setup
+    rf->n_mfmvs = 0;
+    if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
+        int total = 2;
+        if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
+            rf->mfmv_ref[rf->n_mfmvs++] = 0; // last
+            total = 3;
+        }
+        if (rp_ref[4] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[4],
+                                      frm_hdr->frame_offset) > 0)
+        {
+            rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd
+        }
+        if (rp_ref[5] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[5],
+                                      frm_hdr->frame_offset) > 0)
+        {
+            rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2
+        }
+        if (rf->n_mfmvs < total && rp_ref[6] &&
+            get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[6],
+                         frm_hdr->frame_offset) > 0)
+        {
+            rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref
+        }
+        if (rf->n_mfmvs < total && rp_ref[1])
+            rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
+
+        for (int n = 0; n < rf->n_mfmvs; n++) {
+            const unsigned rpoc = ref_poc[rf->mfmv_ref[n]];
+            const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                           rpoc, frm_hdr->frame_offset);
+            if (abs(diff1) > 31) {
+                rf->mfmv_ref2cur[n] = INT_MIN;
+            } else {
+                rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
+                for (int m = 0; m < 7; m++) {
+                    const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
+                    const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                                   rpoc, rrpoc);
+                    // unsigned comparison also catches the < 0 case
+                    rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2;
+                }
+            }
+        }
+    }
+    rf->use_ref_frame_mvs = rf->n_mfmvs > 0;
+
+    return 0;
+}
+
+void dav1d_refmvs_init(refmvs_frame *const rf) {
+    rf->r = NULL;
+    rf->r_stride = 0;
+    rf->rp_proj = NULL;
+    rf->rp_stride = 0;
+}
+
+void dav1d_refmvs_clear(refmvs_frame *const rf) {
+    if (rf->r) free(rf->r);
+    if (rf->rp_proj) free(rf->rp_proj);
+}
diff --git a/src/refmvs.h b/src/refmvs.h
new file mode 100644 (file)
index 0000000..6f68a76
--- /dev/null
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_MVS_H
+#define DAV1D_SRC_REF_MVS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+#include "common/intops.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+#include "src/tables.h"
+
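+// sentinel motion vector: both int16_t components are 0x8000 (INT16_MIN),
+// a value no real motion vector can take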
+#define INVALID_MV 0x80008000
+
+typedef struct refmvs_temporal_block {
+    mv mv;
+    int8_t ref;
+} refmvs_temporal_block;
+
+typedef union refmvs_refpair {
+    int8_t ref[2]; // ref[0] == 0 means intra; ref[1] == -1 means not compound
+    uint16_t pair;
+} refmvs_refpair;
+
+typedef union refmvs_mvpair {
+    mv mv[2];
+    uint64_t n;
+} refmvs_mvpair;
+
+typedef struct refmvs_block {
+    refmvs_mvpair mv;
+    refmvs_refpair ref;
+    uint8_t bs, mf; // mf bitfield: bit 0 (1) = globalmv+affine, bit 1 (2) = newmv
+} refmvs_block;
+
+typedef struct refmvs_frame {
+    const Dav1dFrameHeader *frm_hdr;
+    int iw4, ih4, iw8, ih8;
+    int sbsz;
+    int use_ref_frame_mvs;
+    uint8_t sign_bias[7], mfmv_sign[7];
+    int8_t pocdiff[7];
+    uint8_t mfmv_ref[3];
+    int mfmv_ref2cur[3];
+    int mfmv_ref2ref[3][7];
+    int n_mfmvs;
+
+    refmvs_temporal_block *rp;
+    /*const*/ refmvs_temporal_block *const *rp_ref;
+    refmvs_temporal_block *rp_proj;
+    ptrdiff_t rp_stride;
+
+    refmvs_block *r; // 35 x r_stride memory
+    ptrdiff_t r_stride;
+    int n_tile_rows, n_tile_threads;
+} refmvs_frame;
+
+typedef struct refmvs_tile {
+    const refmvs_frame *rf;
+    refmvs_block *r[32 + 5];
+    refmvs_temporal_block *rp_proj;
+    struct {
+        int start, end;
+    } tile_col, tile_row;
+} refmvs_tile;
+
+typedef struct refmvs_candidate {
+    refmvs_mvpair mv;
+    int weight;
+} refmvs_candidate;
+
+// call once per frame thread
+void dav1d_refmvs_init(refmvs_frame *rf);
+void dav1d_refmvs_clear(refmvs_frame *rf);
+
+// call once per frame
+int dav1d_refmvs_init_frame(refmvs_frame *rf,
+                            const Dav1dSequenceHeader *seq_hdr,
+                            const Dav1dFrameHeader *frm_hdr,
+                            const unsigned ref_poc[7],
+                            refmvs_temporal_block *rp,
+                            const unsigned ref_ref_poc[7][7],
+                            /*const*/ refmvs_temporal_block *const rp_ref[7],
+                            int n_tile_threads);
+
+// initialize temporal MVs; this can be done in any configuration, e.g. one
+// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
+// for a whole frame-width sbrow, where col_{start,end}8 are the frame
+// boundaries. row_{start,end}8 are the superblock row boundaries. A usage
+// sketch follows the prototype below.
+void dav1d_refmvs_load_tmvs(const refmvs_frame *rf, int tile_row_idx,
+                            int col_start8, int col_end8,
+                            int row_start8, int row_end8);
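+//
+// A minimal usage sketch (illustration only, not part of the API): in the
+// single-threaded whole-frame configuration, assuming `rf` was set up by
+// dav1d_refmvs_init_frame(), one pass over the frame could look like:
+//
+//     for (int row8 = 0; row8 < rf->ih8; row8 += rf->sbsz >> 1)
+//         dav1d_refmvs_load_tmvs(rf, 0 /* tile_row_idx */,
+//                                0, rf->iw8, row8, row8 + (rf->sbsz >> 1));
+//
+// (a row_end8 past the frame edge is fine; the function clips to rf->ih8)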
+
+// cache the current tile/sbrow's (or frame/sbrow's) projectable motion
+// vectors into buffers for use in future frames' temporal MV prediction
+void dav1d_refmvs_save_tmvs(const refmvs_tile *rt,
+                            int col_start8, int col_end8,
+                            int row_start8, int row_end8);
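+//
+// Hedged sketch: after decoding superblock row `sby` of a tile whose state
+// `rt` was set up by dav1d_refmvs_tile_sbrow_init(), the matching call could
+// be (tile_col is in 4px units, hence the >> 1 to get 8x8 units):
+//
+//     dav1d_refmvs_save_tmvs(&rt, rt.tile_col.start >> 1, rt.tile_col.end >> 1,
+//                            sby * (rf->sbsz >> 1), (sby + 1) * (rf->sbsz >> 1));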
+
+// initialize tile boundaries and refmvs_block pointers for one tile/sbrow
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
+                                  int tile_col_start4, int tile_col_end4,
+                                  int tile_row_start4, int tile_row_end4,
+                                  int sby, int tile_row_idx);
+
+// call for each block
+void dav1d_refmvs_find(const refmvs_tile *rt,
+                       refmvs_candidate mvstack[8], int *cnt,
+                       int *ctx, const refmvs_refpair ref, enum BlockSize bs,
+                       enum EdgeFlags edge_flags, int by4, int bx4);
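+
+// Putting the API together, a sketch of the expected call order for a
+// single-tile, single-threaded decode (error handling and the per-block
+// decode work omitted; `n_sbrows` is assumed to be the frame height in
+// superblock rows):
+//
+//     refmvs_frame rf;
+//     dav1d_refmvs_init(&rf);                    // once per frame thread
+//     dav1d_refmvs_init_frame(&rf, seq_hdr, frm_hdr, ref_poc, rp,
+//                             ref_ref_poc, rp_ref, 1);   // once per frame
+//     for (int sby = 0; sby < n_sbrows; sby++) {
+//         refmvs_tile rt;
+//         dav1d_refmvs_tile_sbrow_init(&rt, &rf, 0, rf.iw4, 0, rf.ih4,
+//                                      sby, 0);
+//         dav1d_refmvs_load_tmvs(&rf, 0, 0, rf.iw8, sby * (rf.sbsz >> 1),
+//                                (sby + 1) * (rf.sbsz >> 1));
+//         // ... dav1d_refmvs_find() for each block in the sbrow ...
+//         dav1d_refmvs_save_tmvs(&rt, 0, rf.iw8, sby * (rf.sbsz >> 1),
+//                                (sby + 1) * (rf.sbsz >> 1));
+//     }
+//     dav1d_refmvs_clear(&rf);                   // at teardown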
+
+static inline void splat_oneref_mv(refmvs_tile *const rt,
+                                   const int by4, const int bx4,
+                                   const enum BlockSize bs,
+                                   const enum InterPredMode mode,
+                                   const int ref, const mv mv,
+                                   const int is_interintra)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block **rr = &rt->r[(by4 & 31) + 5];
+
+    const refmvs_block tmpl = (refmvs_block) {
+        .ref.ref = { ref + 1, is_interintra ? 0 : -1 },
+        .mv.mv[0] = mv,
+        .bs = bs,
+        .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
+    };
+    do {
+        refmvs_block *r = *rr++ + bx4;
+        for (int x = 0; x < bw4; x++)
+            r[x] = tmpl;
+    } while (--bh4);
+}
+
+static inline void splat_intrabc_mv(refmvs_tile *const rt,
+                                    const int by4, const int bx4,
+                                    const enum BlockSize bs, const mv mv)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block **rr = &rt->r[(by4 & 31) + 5];
+
+    const refmvs_block tmpl = (refmvs_block) {
+        .ref.ref = { 0, -1 },
+        .mv.mv[0] = mv,
+        .bs = bs,
+        .mf = 0,
+    };
+    do {
+        refmvs_block *r = *rr++ + bx4;
+        for (int x = 0; x < bw4; x++) {
+            r[x] = tmpl;
+        }
+    } while (--bh4);
+}
+
+static inline void splat_tworef_mv(refmvs_tile *const rt,
+                                   const int by4, const int bx4,
+                                   const enum BlockSize bs,
+                                   const enum CompInterPredMode mode,
+                                   const refmvs_refpair ref,
+                                   const refmvs_mvpair mv)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block **rr = &rt->r[(by4 & 31) + 5];
+
+    assert(bw4 >= 2 && bh4 >= 2);
+    const refmvs_block tmpl = (refmvs_block) {
+        .ref.pair = ref.pair + 0x0101,
+        .mv = mv,
+        .bs = bs,
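+        // bit 0: globalmv+affine; bit 1: any NEWMV component (0xbc is the
+        // bitmask of the compound modes that contain NEWMV)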
+        .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
+    };
+    do {
+        refmvs_block *r = *rr++ + bx4;
+        for (int x = 0; x < bw4; x++)
+            r[x] = tmpl;
+    } while (--bh4);
+}
+
+static inline void splat_intraref(refmvs_tile *const rt,
+                                  const int by4, const int bx4,
+                                  const enum BlockSize bs)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block **rr = &rt->r[(by4 & 31) + 5];
+
+    const refmvs_block tmpl = (refmvs_block) {
+        .ref.ref = { 0, -1 },
+        .mv.mv[0].n = INVALID_MV,
+        .bs = bs,
+        .mf = 0,
+    };
+    do {
+        refmvs_block *r = *rr++ + bx4;
+        for (int x = 0; x < bw4; x++) {
+            r[x] = tmpl;
+        }
+    } while (--bh4);
+}
+
+#endif /* DAV1D_SRC_REF_MVS_H */
diff --git a/src/scan.c b/src/scan.c
new file mode 100644 (file)
index 0000000..c51c6f5
--- /dev/null
@@ -0,0 +1,444 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+#include "src/scan.h"
+
+static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
+     0,  4,  1,  2,
+     5,  8, 12,  9,
+     6,  3,  7, 10,
+    13, 14, 11, 15,
+};
+static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
+     0,  4,  8, 12,
+     1,  5,  9, 13,
+     2,  6, 10, 14,
+     3,  7, 11, 15,
+};
+static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
+     0,  8,  1, 16,
+     9,  2, 24, 17,
+    10,  3, 25, 18,
+    11,  4, 26, 19,
+    12,  5, 27, 20,
+    13,  6, 28, 21,
+    14,  7, 29, 22,
+    15, 30, 23, 31,
+};
+static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
+     0,  8, 16, 24,
+     1,  9, 17, 25,
+     2, 10, 18, 26,
+     3, 11, 19, 27,
+     4, 12, 20, 28,
+     5, 13, 21, 29,
+     6, 14, 22, 30,
+     7, 15, 23, 31,
+};
+static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
+     0, 16,  1, 32,
+    17,  2, 48, 33,
+    18,  3, 49, 34,
+    19,  4, 50, 35,
+    20,  5, 51, 36,
+    21,  6, 52, 37,
+    22,  7, 53, 38,
+    23,  8, 54, 39,
+    24,  9, 55, 40,
+    25, 10, 56, 41,
+    26, 11, 57, 42,
+    27, 12, 58, 43,
+    28, 13, 59, 44,
+    29, 14, 60, 45,
+    30, 15, 61, 46,
+    31, 62, 47, 63,
+};
+static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
+     0, 16, 32, 48,
+     1, 17, 33, 49,
+     2, 18, 34, 50,
+     3, 19, 35, 51,
+     4, 20, 36, 52,
+     5, 21, 37, 53,
+     6, 22, 38, 54,
+     7, 23, 39, 55,
+     8, 24, 40, 56,
+     9, 25, 41, 57,
+    10, 26, 42, 58,
+    11, 27, 43, 59,
+    12, 28, 44, 60,
+    13, 29, 45, 61,
+    14, 30, 46, 62,
+    15, 31, 47, 63,
+};
+static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
+     0,  1,  4,  2,  5,  8,  3,  6,
+     9, 12,  7, 10, 13, 16, 11, 14,
+    17, 20, 15, 18, 21, 24, 19, 22,
+    25, 28, 23, 26, 29, 27, 30, 31,
+};
+static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
+     0,  4,  8, 12, 16, 20, 24, 28,
+     1,  5,  9, 13, 17, 21, 25, 29,
+     2,  6, 10, 14, 18, 22, 26, 30,
+     3,  7, 11, 15, 19, 23, 27, 31,
+};
+static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
+     0,  8,  1,  2,  9, 16, 24, 17,
+    10,  3,  4, 11, 18, 25, 32, 40,
+    33, 26, 19, 12,  5,  6, 13, 20,
+    27, 34, 41, 48, 56, 49, 42, 35,
+    28, 21, 14,  7, 15, 22, 29, 36,
+    43, 50, 57, 58, 51, 44, 37, 30,
+    23, 31, 38, 45, 52, 59, 60, 53,
+    46, 39, 47, 54, 61, 62, 55, 63,
+};
+static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
+     0,  8, 16, 24, 32, 40, 48, 56,
+     1,  9, 17, 25, 33, 41, 49, 57,
+     2, 10, 18, 26, 34, 42, 50, 58,
+     3, 11, 19, 27, 35, 43, 51, 59,
+     4, 12, 20, 28, 36, 44, 52, 60,
+     5, 13, 21, 29, 37, 45, 53, 61,
+     6, 14, 22, 30, 38, 46, 54, 62,
+     7, 15, 23, 31, 39, 47, 55, 63,
+};
+static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
+      0,  16,   1,  32,  17,   2,  48,  33,
+     18,   3,  64,  49,  34,  19,   4,  80,
+     65,  50,  35,  20,   5,  96,  81,  66,
+     51,  36,  21,   6, 112,  97,  82,  67,
+     52,  37,  22,   7, 113,  98,  83,  68,
+     53,  38,  23,   8, 114,  99,  84,  69,
+     54,  39,  24,   9, 115, 100,  85,  70,
+     55,  40,  25,  10, 116, 101,  86,  71,
+     56,  41,  26,  11, 117, 102,  87,  72,
+     57,  42,  27,  12, 118, 103,  88,  73,
+     58,  43,  28,  13, 119, 104,  89,  74,
+     59,  44,  29,  14, 120, 105,  90,  75,
+     60,  45,  30,  15, 121, 106,  91,  76,
+     61,  46,  31, 122, 107,  92,  77,  62,
+     47, 123, 108,  93,  78,  63, 124, 109,
+     94,  79, 125, 110,  95, 126, 111, 127,
+};
+static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
+      0,  16,  32,  48,  64,  80,  96, 112,
+      1,  17,  33,  49,  65,  81,  97, 113,
+      2,  18,  34,  50,  66,  82,  98, 114,
+      3,  19,  35,  51,  67,  83,  99, 115,
+      4,  20,  36,  52,  68,  84, 100, 116,
+      5,  21,  37,  53,  69,  85, 101, 117,
+      6,  22,  38,  54,  70,  86, 102, 118,
+      7,  23,  39,  55,  71,  87, 103, 119,
+      8,  24,  40,  56,  72,  88, 104, 120,
+      9,  25,  41,  57,  73,  89, 105, 121,
+     10,  26,  42,  58,  74,  90, 106, 122,
+     11,  27,  43,  59,  75,  91, 107, 123,
+     12,  28,  44,  60,  76,  92, 108, 124,
+     13,  29,  45,  61,  77,  93, 109, 125,
+     14,  30,  46,  62,  78,  94, 110, 126,
+     15,  31,  47,  63,  79,  95, 111, 127,
+};
+static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
+      0,  32,   1,  64,  33,   2,  96,  65,
+     34,   3, 128,  97,  66,  35,   4, 160,
+    129,  98,  67,  36,   5, 192, 161, 130,
+     99,  68,  37,   6, 224, 193, 162, 131,
+    100,  69,  38,   7, 225, 194, 163, 132,
+    101,  70,  39,   8, 226, 195, 164, 133,
+    102,  71,  40,   9, 227, 196, 165, 134,
+    103,  72,  41,  10, 228, 197, 166, 135,
+    104,  73,  42,  11, 229, 198, 167, 136,
+    105,  74,  43,  12, 230, 199, 168, 137,
+    106,  75,  44,  13, 231, 200, 169, 138,
+    107,  76,  45,  14, 232, 201, 170, 139,
+    108,  77,  46,  15, 233, 202, 171, 140,
+    109,  78,  47,  16, 234, 203, 172, 141,
+    110,  79,  48,  17, 235, 204, 173, 142,
+    111,  80,  49,  18, 236, 205, 174, 143,
+    112,  81,  50,  19, 237, 206, 175, 144,
+    113,  82,  51,  20, 238, 207, 176, 145,
+    114,  83,  52,  21, 239, 208, 177, 146,
+    115,  84,  53,  22, 240, 209, 178, 147,
+    116,  85,  54,  23, 241, 210, 179, 148,
+    117,  86,  55,  24, 242, 211, 180, 149,
+    118,  87,  56,  25, 243, 212, 181, 150,
+    119,  88,  57,  26, 244, 213, 182, 151,
+    120,  89,  58,  27, 245, 214, 183, 152,
+    121,  90,  59,  28, 246, 215, 184, 153,
+    122,  91,  60,  29, 247, 216, 185, 154,
+    123,  92,  61,  30, 248, 217, 186, 155,
+    124,  93,  62,  31, 249, 218, 187, 156,
+    125,  94,  63, 250, 219, 188, 157, 126,
+     95, 251, 220, 189, 158, 127, 252, 221,
+    190, 159, 253, 222, 191, 254, 223, 255,
+};
+static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
+     0,  1,  4,  2,  5,  8,  3,  6,  9, 12,  7, 10, 13, 16, 11, 14,
+    17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+    33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+    49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
+     0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+     1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+     2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+     3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
+      0,   1,   8,   2,   9,  16,   3,  10,  17,  24,   4,  11,  18,  25,  32,   5,
+     12,  19,  26,  33,  40,   6,  13,  20,  27,  34,  41,  48,   7,  14,  21,  28,
+     35,  42,  49,  56,  15,  22,  29,  36,  43,  50,  57,  64,  23,  30,  37,  44,
+     51,  58,  65,  72,  31,  38,  45,  52,  59,  66,  73,  80,  39,  46,  53,  60,
+     67,  74,  81,  88,  47,  54,  61,  68,  75,  82,  89,  96,  55,  62,  69,  76,
+     83,  90,  97, 104,  63,  70,  77,  84,  91,  98, 105, 112,  71,  78,  85,  92,
+     99, 106, 113, 120,  79,  86,  93, 100, 107, 114, 121,  87,  94, 101, 108, 115,
+    122,  95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
+};
+static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
+      0,   8,  16,  24,  32,  40,  48,  56,  64,  72,  80,  88,  96, 104, 112, 120,
+      1,   9,  17,  25,  33,  41,  49,  57,  65,  73,  81,  89,  97, 105, 113, 121,
+      2,  10,  18,  26,  34,  42,  50,  58,  66,  74,  82,  90,  98, 106, 114, 122,
+      3,  11,  19,  27,  35,  43,  51,  59,  67,  75,  83,  91,  99, 107, 115, 123,
+      4,  12,  20,  28,  36,  44,  52,  60,  68,  76,  84,  92, 100, 108, 116, 124,
+      5,  13,  21,  29,  37,  45,  53,  61,  69,  77,  85,  93, 101, 109, 117, 125,
+      6,  14,  22,  30,  38,  46,  54,  62,  70,  78,  86,  94, 102, 110, 118, 126,
+      7,  15,  23,  31,  39,  47,  55,  63,  71,  79,  87,  95, 103, 111, 119, 127,
+};
+static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
+      0,  16,   1,   2,  17,  32,  48,  33,  18,   3,   4,  19,  34,  49,  64,  80,
+     65,  50,  35,  20,   5,   6,  21,  36,  51,  66,  81,  96, 112,  97,  82,  67,
+     52,  37,  22,   7,   8,  23,  38,  53,  68,  83,  98, 113, 128, 144, 129, 114,
+     99,  84,  69,  54,  39,  24,   9,  10,  25,  40,  55,  70,  85, 100, 115, 130,
+    145, 160, 176, 161, 146, 131, 116, 101,  86,  71,  56,  41,  26,  11,  12,  27,
+     42,  57,  72,  87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148,
+    133, 118, 103,  88,  73,  58,  43,  28,  13,  14,  29,  44,  59,  74,  89, 104,
+    119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135,
+    120, 105,  90,  75,  60,  45,  30,  15,  31,  46,  61,  76,  91, 106, 121, 136,
+    151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122,
+    107,  92,  77,  62,  47,  63,  78,  93, 108, 123, 138, 153, 168, 183, 198, 213,
+    228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109,  94,  79,  95, 110,
+    125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156,
+    141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203,
+    188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
+    175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
+};
+static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
+      0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+      1,  17,  33,  49,  65,  81,  97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+      2,  18,  34,  50,  66,  82,  98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+      3,  19,  35,  51,  67,  83,  99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+      4,  20,  36,  52,  68,  84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+      5,  21,  37,  53,  69,  85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+      6,  22,  38,  54,  70,  86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+      7,  23,  39,  55,  71,  87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+      8,  24,  40,  56,  72,  88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+      9,  25,  41,  57,  73,  89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+     10,  26,  42,  58,  74,  90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+     11,  27,  43,  59,  75,  91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+     12,  28,  44,  60,  76,  92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+     13,  29,  45,  61,  77,  93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+     14,  30,  46,  62,  78,  94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+     15,  31,  47,  63,  79,  95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
+      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+};
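+// note: the "mcol" scan is plain raster (identity) order, so this single
+// 256-entry table serves as the TX_CLASS_H scan for every transform size
+// with at most 256 coefficients (see dav1d_scans below)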
+static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
+      0,  32,   1,  64,  33,   2,  96,  65,  34,   3, 128,  97,  66,  35,   4, 160,
+    129,  98,  67,  36,   5, 192, 161, 130,  99,  68,  37,   6, 224, 193, 162, 131,
+    100,  69,  38,   7, 256, 225, 194, 163, 132, 101,  70,  39,   8, 288, 257, 226,
+    195, 164, 133, 102,  71,  40,   9, 320, 289, 258, 227, 196, 165, 134, 103,  72,
+     41,  10, 352, 321, 290, 259, 228, 197, 166, 135, 104,  73,  42,  11, 384, 353,
+    322, 291, 260, 229, 198, 167, 136, 105,  74,  43,  12, 416, 385, 354, 323, 292,
+    261, 230, 199, 168, 137, 106,  75,  44,  13, 448, 417, 386, 355, 324, 293, 262,
+    231, 200, 169, 138, 107,  76,  45,  14, 480, 449, 418, 387, 356, 325, 294, 263,
+    232, 201, 170, 139, 108,  77,  46,  15, 481, 450, 419, 388, 357, 326, 295, 264,
+    233, 202, 171, 140, 109,  78,  47,  16, 482, 451, 420, 389, 358, 327, 296, 265,
+    234, 203, 172, 141, 110,  79,  48,  17, 483, 452, 421, 390, 359, 328, 297, 266,
+    235, 204, 173, 142, 111,  80,  49,  18, 484, 453, 422, 391, 360, 329, 298, 267,
+    236, 205, 174, 143, 112,  81,  50,  19, 485, 454, 423, 392, 361, 330, 299, 268,
+    237, 206, 175, 144, 113,  82,  51,  20, 486, 455, 424, 393, 362, 331, 300, 269,
+    238, 207, 176, 145, 114,  83,  52,  21, 487, 456, 425, 394, 363, 332, 301, 270,
+    239, 208, 177, 146, 115,  84,  53,  22, 488, 457, 426, 395, 364, 333, 302, 271,
+    240, 209, 178, 147, 116,  85,  54,  23, 489, 458, 427, 396, 365, 334, 303, 272,
+    241, 210, 179, 148, 117,  86,  55,  24, 490, 459, 428, 397, 366, 335, 304, 273,
+    242, 211, 180, 149, 118,  87,  56,  25, 491, 460, 429, 398, 367, 336, 305, 274,
+    243, 212, 181, 150, 119,  88,  57,  26, 492, 461, 430, 399, 368, 337, 306, 275,
+    244, 213, 182, 151, 120,  89,  58,  27, 493, 462, 431, 400, 369, 338, 307, 276,
+    245, 214, 183, 152, 121,  90,  59,  28, 494, 463, 432, 401, 370, 339, 308, 277,
+    246, 215, 184, 153, 122,  91,  60,  29, 495, 464, 433, 402, 371, 340, 309, 278,
+    247, 216, 185, 154, 123,  92,  61,  30, 496, 465, 434, 403, 372, 341, 310, 279,
+    248, 217, 186, 155, 124,  93,  62,  31, 497, 466, 435, 404, 373, 342, 311, 280,
+    249, 218, 187, 156, 125,  94,  63, 498, 467, 436, 405, 374, 343, 312, 281, 250,
+    219, 188, 157, 126,  95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189,
+    158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470,
+    439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316,
+    285, 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+    380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
+    351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
+};
+static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
+      0,   1,   8,   2,   9,  16,   3,  10,  17,  24,   4,  11,  18,  25,  32,   5,  12,  19,  26,  33,  40,   6,  13,  20,  27,  34,  41,  48,   7,  14,  21,  28,
+     35,  42,  49,  56,  15,  22,  29,  36,  43,  50,  57,  64,  23,  30,  37,  44,  51,  58,  65,  72,  31,  38,  45,  52,  59,  66,  73,  80,  39,  46,  53,  60,
+     67,  74,  81,  88,  47,  54,  61,  68,  75,  82,  89,  96,  55,  62,  69,  76,  83,  90,  97, 104,  63,  70,  77,  84,  91,  98, 105, 112,  71,  78,  85,  92,
+     99, 106, 113, 120,  79,  86,  93, 100, 107, 114, 121, 128,  87,  94, 101, 108, 115, 122, 129, 136,  95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124,
+    131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156,
+    163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188,
+    195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
+    227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
+};
+static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
+      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  19,  34,  49,  64,   5,  20,  35,  50,  65,  80,   6,  21,  36,  51,  66,  81,  96,   7,  22,  37,  52,
+     67,  82,  97, 112,   8,  23,  38,  53,  68,  83,  98, 113, 128,   9,  24,  39,  54,  69,  84,  99, 114, 129, 144,  10,  25,  40,  55,  70,  85, 100, 115, 130,
+    145, 160,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176,  12,  27,  42,  57,  72,  87, 102, 117, 132, 147, 162, 177, 192,  13,  28,  43,  58,  73,
+     88, 103, 118, 133, 148, 163, 178, 193, 208,  14,  29,  44,  59,  74,  89, 104, 119, 134, 149, 164, 179, 194, 209, 224,  15,  30,  45,  60,  75,  90, 105, 120,
+    135, 150, 165, 180, 195, 210, 225, 240,  31,  46,  61,  76,  91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256,  47,  62,  77,  92, 107, 122, 137, 152,
+    167, 182, 197, 212, 227, 242, 257, 272,  63,  78,  93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288,  79,  94, 109, 124, 139, 154, 169, 184,
+    199, 214, 229, 244, 259, 274, 289, 304,  95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216,
+    231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248,
+    263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280,
+    295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312,
+    327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344,
+    359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376,
+    391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423,
+    438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366,
+    381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+    459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
+};
+static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
+       0,   32,    1,    2,   33,   64,   96,   65,   34,    3,    4,   35,   66,   97,  128,  160,  129,   98,   67,   36,    5,    6,   37,   68,   99,  130,  161,  192,  224,  193,  162,  131,
+     100,   69,   38,    7,    8,   39,   70,  101,  132,  163,  194,  225,  256,  288,  257,  226,  195,  164,  133,  102,   71,   40,    9,   10,   41,   72,  103,  134,  165,  196,  227,  258,
+     289,  320,  352,  321,  290,  259,  228,  197,  166,  135,  104,   73,   42,   11,   12,   43,   74,  105,  136,  167,  198,  229,  260,  291,  322,  353,  384,  416,  385,  354,  323,  292,
+     261,  230,  199,  168,  137,  106,   75,   44,   13,   14,   45,   76,  107,  138,  169,  200,  231,  262,  293,  324,  355,  386,  417,  448,  480,  449,  418,  387,  356,  325,  294,  263,
+     232,  201,  170,  139,  108,   77,   46,   15,   16,   47,   78,  109,  140,  171,  202,  233,  264,  295,  326,  357,  388,  419,  450,  481,  512,  544,  513,  482,  451,  420,  389,  358,
+     327,  296,  265,  234,  203,  172,  141,  110,   79,   48,   17,   18,   49,   80,  111,  142,  173,  204,  235,  266,  297,  328,  359,  390,  421,  452,  483,  514,  545,  576,  608,  577,
+     546,  515,  484,  453,  422,  391,  360,  329,  298,  267,  236,  205,  174,  143,  112,   81,   50,   19,   20,   51,   82,  113,  144,  175,  206,  237,  268,  299,  330,  361,  392,  423,
+     454,  485,  516,  547,  578,  609,  640,  672,  641,  610,  579,  548,  517,  486,  455,  424,  393,  362,  331,  300,  269,  238,  207,  176,  145,  114,   83,   52,   21,   22,   53,   84,
+     115,  146,  177,  208,  239,  270,  301,  332,  363,  394,  425,  456,  487,  518,  549,  580,  611,  642,  673,  704,  736,  705,  674,  643,  612,  581,  550,  519,  488,  457,  426,  395,
+     364,  333,  302,  271,  240,  209,  178,  147,  116,   85,   54,   23,   24,   55,   86,  117,  148,  179,  210,  241,  272,  303,  334,  365,  396,  427,  458,  489,  520,  551,  582,  613,
+     644,  675,  706,  737,  768,  800,  769,  738,  707,  676,  645,  614,  583,  552,  521,  490,  459,  428,  397,  366,  335,  304,  273,  242,  211,  180,  149,  118,   87,   56,   25,   26,
+      57,   88,  119,  150,  181,  212,  243,  274,  305,  336,  367,  398,  429,  460,  491,  522,  553,  584,  615,  646,  677,  708,  739,  770,  801,  832,  864,  833,  802,  771,  740,  709,
+     678,  647,  616,  585,  554,  523,  492,  461,  430,  399,  368,  337,  306,  275,  244,  213,  182,  151,  120,   89,   58,   27,   28,   59,   90,  121,  152,  183,  214,  245,  276,  307,
+     338,  369,  400,  431,  462,  493,  524,  555,  586,  617,  648,  679,  710,  741,  772,  803,  834,  865,  896,  928,  897,  866,  835,  804,  773,  742,  711,  680,  649,  618,  587,  556,
+     525,  494,  463,  432,  401,  370,  339,  308,  277,  246,  215,  184,  153,  122,   91,   60,   29,   30,   61,   92,  123,  154,  185,  216,  247,  278,  309,  340,  371,  402,  433,  464,
+     495,  526,  557,  588,  619,  650,  681,  712,  743,  774,  805,  836,  867,  898,  929,  960,  992,  961,  930,  899,  868,  837,  806,  775,  744,  713,  682,  651,  620,  589,  558,  527,
+     496,  465,  434,  403,  372,  341,  310,  279,  248,  217,  186,  155,  124,   93,   62,   31,   63,   94,  125,  156,  187,  218,  249,  280,  311,  342,  373,  404,  435,  466,  497,  528,
+     559,  590,  621,  652,  683,  714,  745,  776,  807,  838,  869,  900,  931,  962,  993,  994,  963,  932,  901,  870,  839,  808,  777,  746,  715,  684,  653,  622,  591,  560,  529,  498,
+     467,  436,  405,  374,  343,  312,  281,  250,  219,  188,  157,  126,   95,  127,  158,  189,  220,  251,  282,  313,  344,  375,  406,  437,  468,  499,  530,  561,  592,  623,  654,  685,
+     716,  747,  778,  809,  840,  871,  902,  933,  964,  995,  996,  965,  934,  903,  872,  841,  810,  779,  748,  717,  686,  655,  624,  593,  562,  531,  500,  469,  438,  407,  376,  345,
+     314,  283,  252,  221,  190,  159,  191,  222,  253,  284,  315,  346,  377,  408,  439,  470,  501,  532,  563,  594,  625,  656,  687,  718,  749,  780,  811,  842,  873,  904,  935,  966,
+     997,  998,  967,  936,  905,  874,  843,  812,  781,  750,  719,  688,  657,  626,  595,  564,  533,  502,  471,  440,  409,  378,  347,  316,  285,  254,  223,  255,  286,  317,  348,  379,
+     410,  441,  472,  503,  534,  565,  596,  627,  658,  689,  720,  751,  782,  813,  844,  875,  906,  937,  968,  999, 1000,  969,  938,  907,  876,  845,  814,  783,  752,  721,  690,  659,
+     628,  597,  566,  535,  504,  473,  442,  411,  380,  349,  318,  287,  319,  350,  381,  412,  443,  474,  505,  536,  567,  598,  629,  660,  691,  722,  753,  784,  815,  846,  877,  908,
+     939,  970, 1001, 1002,  971,  940,  909,  878,  847,  816,  785,  754,  723,  692,  661,  630,  599,  568,  537,  506,  475,  444,  413,  382,  351,  383,  414,  445,  476,  507,  538,  569,
+     600,  631,  662,  693,  724,  755,  786,  817,  848,  879,  910,  941,  972, 1003, 1004,  973,  942,  911,  880,  849,  818,  787,  756,  725,  694,  663,  632,  601,  570,  539,  508,  477,
+     446,  415,  447,  478,  509,  540,  571,  602,  633,  664,  695,  726,  757,  788,  819,  850,  881,  912,  943,  974, 1005, 1006,  975,  944,  913,  882,  851,  820,  789,  758,  727,  696,
+     665,  634,  603,  572,  541,  510,  479,  511,  542,  573,  604,  635,  666,  697,  728,  759,  790,  821,  852,  883,  914,  945,  976, 1007, 1008,  977,  946,  915,  884,  853,  822,  791,
+     760,  729,  698,  667,  636,  605,  574,  543,  575,  606,  637,  668,  699,  730,  761,  792,  823,  854,  885,  916,  947,  978, 1009, 1010,  979,  948,  917,  886,  855,  824,  793,  762,
+     731,  700,  669,  638,  607,  639,  670,  701,  732,  763,  794,  825,  856,  887,  918,  949,  980, 1011, 1012,  981,  950,  919,  888,  857,  826,  795,  764,  733,  702,  671,  703,  734,
+     765,  796,  827,  858,  889,  920,  951,  982, 1013, 1014,  983,  952,  921,  890,  859,  828,  797,  766,  735,  767,  798,  829,  860,  891,  922,  953,  984, 1015, 1016,  985,  954,  923,
+     892,  861,  830,  799,  831,  862,  893,  924,  955,  986, 1017, 1018,  987,  956,  925,  894,  863,  895,  926,  957,  988, 1019, 1020,  989,  958,  927,  959,  990, 1021, 1022,  991, 1023,
+};
+
+const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
+    [TX_4X4] = {
+        [TX_CLASS_2D] = av1_default_scan_4x4,
+        [TX_CLASS_V]  = av1_mrow_scan_4x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [TX_8X8] = {
+        [TX_CLASS_2D] = av1_default_scan_8x8,
+        [TX_CLASS_V]  = av1_mrow_scan_8x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [TX_16X16] = {
+        [TX_CLASS_2D] = av1_default_scan_16x16,
+        [TX_CLASS_V]  = av1_mrow_scan_16x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [TX_32X32] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [TX_64X64] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_4X8] = {
+        [TX_CLASS_2D] = av1_default_scan_4x8,
+        [TX_CLASS_V]  = av1_mrow_scan_4x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X4] = {
+        [TX_CLASS_2D] = av1_default_scan_8x4,
+        [TX_CLASS_V]  = av1_mrow_scan_8x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X16] = {
+        [TX_CLASS_2D] = av1_default_scan_8x16,
+        [TX_CLASS_V]  = av1_mrow_scan_8x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X8] = {
+        [TX_CLASS_2D] = av1_default_scan_16x8,
+        [TX_CLASS_V]  = av1_mrow_scan_16x8,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X32] = {
+        [TX_CLASS_2D] = av1_default_scan_16x32,
+    }, [RTX_32X16] = {
+        [TX_CLASS_2D] = av1_default_scan_32x16,
+    }, [RTX_32X64] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_64X32] = {
+        [TX_CLASS_2D] = av1_default_scan_32x32,
+    }, [RTX_4X16] = {
+        [TX_CLASS_2D] = av1_default_scan_4x16,
+        [TX_CLASS_V]  = av1_mrow_scan_4x16,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_16X4] = {
+        [TX_CLASS_2D] = av1_default_scan_16x4,
+        [TX_CLASS_V]  = av1_mrow_scan_16x4,
+        [TX_CLASS_H]  = av1_mcol_scan_16x16,
+    }, [RTX_8X32] = {
+        [TX_CLASS_2D] = av1_default_scan_8x32,
+    }, [RTX_32X8] = {
+        [TX_CLASS_2D] = av1_default_scan_32x8,
+    }, [RTX_16X64] = {
+        [TX_CLASS_2D] = av1_default_scan_16x32,
+    }, [RTX_64X16] = {
+        [TX_CLASS_2D] = av1_default_scan_32x16,
+    },
+};
diff --git a/src/scan.h b/src/scan.h
new file mode 100644 (file)
index 0000000..c474b7f
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_SCAN_H
+#define DAV1D_SRC_SCAN_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
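+// indexed as dav1d_scans[/*enum RectTxfmSize*/ tx][/*enum TxClass*/ cls];
+// e.g. dav1d_scans[TX_8X8][TX_CLASS_2D][i] is the position of the i-th
+// coefficient in the default 8x8 scan order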
+extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
+
+#endif /* DAV1D_SRC_SCAN_H */
diff --git a/src/tables.c b/src/tables.c
new file mode 100644 (file)
index 0000000..30d9fa6
--- /dev/null
@@ -0,0 +1,1022 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+#include "src/levels.h"
+#include "src/tables.h"
+
+const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS] = {
+    {
+        // partitions:
+        // none,  h,    v, split,  tts,  tbs,  tls,  trs,   h4,   v4
+        { 0x00, 0x00, 0x10,   -1, 0x00, 0x10, 0x10, 0x10,   -1,   -1 }, // bl128
+        { 0x10, 0x10, 0x18,   -1, 0x10, 0x18, 0x18, 0x18, 0x10, 0x1c }, // bl64
+        { 0x18, 0x18, 0x1c,   -1, 0x18, 0x1c, 0x1c, 0x1c, 0x18, 0x1e }, // bl32
+        { 0x1c, 0x1c, 0x1e,   -1, 0x1c, 0x1e, 0x1e, 0x1e, 0x1c, 0x1f }, // bl16
+        { 0x1e, 0x1e, 0x1f, 0x1f,   -1,   -1,   -1,   -1,   -1,   -1 }, // bl8
+    }, {
+        { 0x00, 0x10, 0x00,   -1, 0x10, 0x10, 0x00, 0x10,   -1,   -1 }, // bl128
+        { 0x10, 0x18, 0x10,   -1, 0x18, 0x18, 0x10, 0x18, 0x1c, 0x10 }, // bl64
+        { 0x18, 0x1c, 0x18,   -1, 0x1c, 0x1c, 0x18, 0x1c, 0x1e, 0x18 }, // bl32
+        { 0x1c, 0x1e, 0x1c,   -1, 0x1e, 0x1e, 0x1c, 0x1e, 0x1f, 0x1c }, // bl16
+        { 0x1e, 0x1f, 0x1e, 0x1f,   -1,   -1,   -1,   -1,   -1,   -1 }, // bl8
+    }
+};
+
+const uint8_t /* enum BlockSize */
+    dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2] =
+{
+    [BL_128X128] = {
+        [PARTITION_NONE]           = { BS_128x128 },
+        [PARTITION_H]              = { BS_128x64 },
+        [PARTITION_V]              = { BS_64x128 },
+        [PARTITION_T_TOP_SPLIT]    = { BS_64x64, BS_128x64 },
+        [PARTITION_T_BOTTOM_SPLIT] = { BS_128x64, BS_64x64 },
+        [PARTITION_T_LEFT_SPLIT]   = { BS_64x64, BS_64x128 },
+        [PARTITION_T_RIGHT_SPLIT]  = { BS_64x128, BS_64x64 },
+    }, [BL_64X64] = {
+        [PARTITION_NONE]           = { BS_64x64 },
+        [PARTITION_H]              = { BS_64x32 },
+        [PARTITION_V]              = { BS_32x64 },
+        [PARTITION_T_TOP_SPLIT]    = { BS_32x32, BS_64x32 },
+        [PARTITION_T_BOTTOM_SPLIT] = { BS_64x32, BS_32x32 },
+        [PARTITION_T_LEFT_SPLIT]   = { BS_32x32, BS_32x64 },
+        [PARTITION_T_RIGHT_SPLIT]  = { BS_32x64, BS_32x32 },
+        [PARTITION_H4]             = { BS_64x16 },
+        [PARTITION_V4]             = { BS_16x64 },
+    }, [BL_32X32] = {
+        [PARTITION_NONE]           = { BS_32x32 },
+        [PARTITION_H]              = { BS_32x16 },
+        [PARTITION_V]              = { BS_16x32 },
+        [PARTITION_T_TOP_SPLIT]    = { BS_16x16, BS_32x16 },
+        [PARTITION_T_BOTTOM_SPLIT] = { BS_32x16, BS_16x16 },
+        [PARTITION_T_LEFT_SPLIT]   = { BS_16x16, BS_16x32 },
+        [PARTITION_T_RIGHT_SPLIT]  = { BS_16x32, BS_16x16 },
+        [PARTITION_H4]             = { BS_32x8  },
+        [PARTITION_V4]             = { BS_8x32  },
+    }, [BL_16X16] = {
+        [PARTITION_NONE]           = { BS_16x16 },
+        [PARTITION_H]              = { BS_16x8  },
+        [PARTITION_V]              = { BS_8x16  },
+        [PARTITION_T_TOP_SPLIT]    = { BS_8x8,   BS_16x8  },
+        [PARTITION_T_BOTTOM_SPLIT] = { BS_16x8,  BS_8x8   },
+        [PARTITION_T_LEFT_SPLIT]   = { BS_8x8,   BS_8x16  },
+        [PARTITION_T_RIGHT_SPLIT]  = { BS_8x16,  BS_8x8   },
+        [PARTITION_H4]             = { BS_16x4  },
+        [PARTITION_V4]             = { BS_4x16  },
+    }, [BL_8X8] = {
+        [PARTITION_NONE]           = { BS_8x8   },
+        [PARTITION_H]              = { BS_8x4   },
+        [PARTITION_V]              = { BS_4x8   },
+        [PARTITION_SPLIT]          = { BS_4x4   },
+    }
+};
+
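+// { width, height, log2(width), log2(height) }, all in 4-pixel block units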
+const uint8_t dav1d_block_dimensions[N_BS_SIZES][4] = {
+    [BS_128x128] = { 32, 32, 5, 5 },
+    [BS_128x64]  = { 32, 16, 5, 4 },
+    [BS_64x128]  = { 16, 32, 4, 5 },
+    [BS_64x64]   = { 16, 16, 4, 4 },
+    [BS_64x32]   = { 16,  8, 4, 3 },
+    [BS_64x16]   = { 16,  4, 4, 2 },
+    [BS_32x64]   = {  8, 16, 3, 4 },
+    [BS_32x32]   = {  8,  8, 3, 3 },
+    [BS_32x16]   = {  8,  4, 3, 2 },
+    [BS_32x8]    = {  8,  2, 3, 1 },
+    [BS_16x64]   = {  4, 16, 2, 4 },
+    [BS_16x32]   = {  4,  8, 2, 3 },
+    [BS_16x16]   = {  4,  4, 2, 2 },
+    [BS_16x8]    = {  4,  2, 2, 1 },
+    [BS_16x4]    = {  4,  1, 2, 0 },
+    [BS_8x32]    = {  2,  8, 1, 3 },
+    [BS_8x16]    = {  2,  4, 1, 2 },
+    [BS_8x8]     = {  2,  2, 1, 1 },
+    [BS_8x4]     = {  2,  1, 1, 0 },
+    [BS_4x16]    = {  1,  4, 0, 2 },
+    [BS_4x8]     = {  1,  2, 0, 1 },
+    [BS_4x4]     = {  1,  1, 0, 0 },
+};
+
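+// .w/.h are in 4px units, .lw/.lh their log2s, .min/.max the log2s of the
+// smaller/larger dimension, and .sub the transform size after one split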
+const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES] = {
+    [ TX_4X4]   = { .w = 1, .h = 1, .lw = 0, .lh = 0,
+                    .min = 0, .max = 0, .ctx = 0 },
+    [ TX_8X8]   = { .w = 2, .h = 2, .lw = 1, .lh = 1,
+                    .min = 1, .max = 1, .sub = TX_4X4, .ctx = 1 },
+    [ TX_16X16] = { .w = 4, .h = 4, .lw = 2, .lh = 2,
+                    .min = 2, .max = 2, .sub = TX_8X8, .ctx = 2 },
+    [ TX_32X32] = { .w = 8, .h = 8, .lw = 3, .lh = 3,
+                    .min = 3, .max = 3, .sub = TX_16X16, .ctx = 3 },
+    [ TX_64X64] = { .w = 16, .h = 16, .lw = 4, .lh = 4,
+                    .min = 4, .max = 4, .sub = TX_32X32, .ctx = 4 },
+    [RTX_4X8]   = { .w = 1, .h = 2, .lw = 0, .lh = 1,
+                    .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+    [RTX_8X4]   = { .w = 2, .h = 1, .lw = 1, .lh = 0,
+                    .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+    [RTX_8X16]  = { .w = 2, .h = 4, .lw = 1, .lh = 2,
+                    .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+    [RTX_16X8]  = { .w = 4, .h = 2, .lw = 2, .lh = 1,
+                    .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+    [RTX_16X32] = { .w = 4, .h = 8, .lw = 2, .lh = 3,
+                    .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+    [RTX_32X16] = { .w = 8, .h = 4, .lw = 3, .lh = 2,
+                    .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+    [RTX_32X64] = { .w = 8, .h = 16, .lw = 3, .lh = 4,
+                    .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+    [RTX_64X32] = { .w = 16, .h = 8, .lw = 4, .lh = 3,
+                    .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+    [RTX_4X16]  = { .w = 1, .h = 4, .lw = 0, .lh = 2,
+                    .min = 0, .max = 2, .sub = RTX_4X8, .ctx = 1 },
+    [RTX_16X4]  = { .w = 4, .h = 1, .lw = 2, .lh = 0,
+                    .min = 0, .max = 2, .sub = RTX_8X4, .ctx = 1 },
+    [RTX_8X32]  = { .w = 2, .h = 8, .lw = 1, .lh = 3,
+                    .min = 1, .max = 3, .sub = RTX_8X16, .ctx = 2 },
+    [RTX_32X8]  = { .w = 8, .h = 2, .lw = 3, .lh = 1,
+                    .min = 1, .max = 3, .sub = RTX_16X8, .ctx = 2 },
+    [RTX_16X64] = { .w = 4, .h = 16, .lw = 2, .lh = 4,
+                    .min = 2, .max = 4, .sub = RTX_16X32, .ctx = 3 },
+    [RTX_64X16] = { .w = 16, .h = 4, .lw = 4, .lh = 2,
+                    .min = 2, .max = 4, .sub = RTX_32X16, .ctx = 3 },
+};
+
+const uint8_t /* enum (Rect)TxfmSize */
+    dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */] =
+{
+    [BS_128x128] = {  TX_64X64,  TX_32X32,  TX_32X32,  TX_32X32 },
+    [BS_128x64]  = {  TX_64X64,  TX_32X32,  TX_32X32,  TX_32X32 },
+    [BS_64x128]  = {  TX_64X64,  TX_32X32,       0,    TX_32X32 },
+    [BS_64x64]   = {  TX_64X64,  TX_32X32,  TX_32X32,  TX_32X32 },
+    [BS_64x32]   = { RTX_64X32, RTX_32X16,  TX_32X32,  TX_32X32 },
+    [BS_64x16]   = { RTX_64X16, RTX_32X8,  RTX_32X16, RTX_32X16 },
+    [BS_32x64]   = { RTX_32X64, RTX_16X32,       0,    TX_32X32 },
+    [BS_32x32]   = {  TX_32X32,  TX_16X16, RTX_16X32,  TX_32X32 },
+    [BS_32x16]   = { RTX_32X16, RTX_16X8,   TX_16X16, RTX_32X16 },
+    [BS_32x8]    = { RTX_32X8,  RTX_16X4,  RTX_16X8,  RTX_32X8  },
+    [BS_16x64]   = { RTX_16X64, RTX_8X32,        0,   RTX_16X32 },
+    [BS_16x32]   = { RTX_16X32, RTX_8X16,        0,   RTX_16X32 },
+    [BS_16x16]   = {  TX_16X16,  TX_8X8,   RTX_8X16,   TX_16X16 },
+    [BS_16x8]    = { RTX_16X8,  RTX_8X4,    TX_8X8,   RTX_16X8  },
+    [BS_16x4]    = { RTX_16X4,  RTX_8X4,   RTX_8X4,   RTX_16X4  },
+    [BS_8x32]    = { RTX_8X32,  RTX_4X16,       0,    RTX_8X32  },
+    [BS_8x16]    = { RTX_8X16,  RTX_4X8,        0,    RTX_8X16  },
+    [BS_8x8]     = {  TX_8X8,    TX_4X4,   RTX_4X8,    TX_8X8   },
+    [BS_8x4]     = { RTX_8X4,    TX_4X4,    TX_4X4,   RTX_8X4   },
+    [BS_4x16]    = { RTX_4X16,  RTX_4X8,        0,    RTX_4X16  },
+    [BS_4x8]     = { RTX_4X8,    TX_4X4,        0,    RTX_4X8   },
+    [BS_4x4]     = {  TX_4X4,    TX_4X4,    TX_4X4,    TX_4X4   },
+};
+
+const uint8_t /* enum TxfmType */
+    dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES] =
+{
+    [DC_PRED]              = DCT_DCT,
+    [VERT_PRED]            = ADST_DCT,
+    [HOR_PRED]             = DCT_ADST,
+    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
+    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+    [VERT_RIGHT_PRED]      = ADST_DCT,
+    [HOR_DOWN_PRED]        = DCT_ADST,
+    [HOR_UP_PRED]          = DCT_ADST,
+    [VERT_LEFT_PRED]       = ADST_DCT,
+    [SMOOTH_PRED]          = ADST_ADST,
+    [SMOOTH_V_PRED]        = ADST_DCT,
+    [SMOOTH_H_PRED]        = DCT_ADST,
+    [PAETH_PRED]           = ADST_ADST,
+};
+
+const uint8_t /* enum InterPredMode */
+    dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2] =
+{
+    [NEARESTMV_NEARESTMV] = { NEARESTMV, NEARESTMV },
+    [NEARMV_NEARMV]       = { NEARMV,    NEARMV    },
+    [NEWMV_NEWMV]         = { NEWMV,     NEWMV     },
+    [GLOBALMV_GLOBALMV]   = { GLOBALMV,  GLOBALMV  },
+    [NEWMV_NEARESTMV]     = { NEWMV,     NEARESTMV },
+    [NEWMV_NEARMV]        = { NEWMV,     NEARMV    },
+    [NEARESTMV_NEWMV]     = { NEARESTMV, NEWMV     },
+    [NEARMV_NEWMV]        = { NEARMV,    NEWMV     },
+};
+
+const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
+    [BL_128X128] = N_PARTITIONS - 3,
+    [BL_64X64]   = N_PARTITIONS - 1,
+    [BL_32X32]   = N_PARTITIONS - 1,
+    [BL_16X16]   = N_PARTITIONS - 1,
+    [BL_8X8]     = N_SUB8X8_PARTITIONS - 1,
+};
+
+const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
+    /* Intra2 */
+    IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+    /* Intra1 */
+    IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+    /* Inter2 */
+    IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
+    DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+    /* Inter1 */
+    IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
+    DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
+    ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+};
+
+const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
+    [BS_128x128] = 3,
+    [BS_128x64]  = 3,
+    [BS_64x128]  = 3,
+    [BS_64x64]   = 3,
+    [BS_64x32]   = 3,
+    [BS_64x16]   = 2,
+    [BS_32x64]   = 3,
+    [BS_32x32]   = 3,
+    [BS_32x16]   = 2,
+    [BS_32x8 ]   = 1,
+    [BS_16x64]   = 2,
+    [BS_16x32]   = 2,
+    [BS_16x16]   = 2,
+    [BS_16x8 ]   = 1,
+    [BS_16x4 ]   = 0,
+    [BS_8x32 ]   = 1,
+    [BS_8x16 ]   = 1,
+    [BS_8x8  ]   = 1,
+    [BS_8x4  ]   = 0,
+    [BS_4x16 ]   = 0,
+    [BS_4x8  ]   = 0,
+    [BS_4x4  ]   = 0,
+};
+
+const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
+    { /* w == h */
+        {  0,  1,  6,  6, 21 },
+        {  1,  6,  6, 21, 21 },
+        {  6,  6, 21, 21, 21 },
+        {  6, 21, 21, 21, 21 },
+        { 21, 21, 21, 21, 21 },
+    }, { /* w > h */
+        {  0, 16,  6,  6, 21 },
+        { 16, 16,  6, 21, 21 },
+        { 16, 16, 21, 21, 21 },
+        { 16, 16, 21, 21, 21 },
+        { 16, 16, 21, 21, 21 },
+    }, { /* w < h */
+        {  0, 11, 11, 11, 11 },
+        { 11, 11, 11, 11, 11 },
+        {  6,  6, 21, 21, 21 },
+        {  6, 21, 21, 21, 21 },
+        { 21, 21, 21, 21, 21 },
+    },
+};
+
+const uint8_t dav1d_skip_ctx[5][5] = {
+    { 1, 2, 2, 2, 3 },
+    { 2, 4, 4, 4, 5 },
+    { 2, 4, 4, 4, 5 },
+    { 2, 4, 4, 4, 5 },
+    { 3, 5, 5, 5, 6 },
+};
+
+const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
+    [DCT_DCT]           = TX_CLASS_2D,
+    [ADST_DCT]          = TX_CLASS_2D,
+    [DCT_ADST]          = TX_CLASS_2D,
+    [ADST_ADST]         = TX_CLASS_2D,
+    [FLIPADST_DCT]      = TX_CLASS_2D,
+    [DCT_FLIPADST]      = TX_CLASS_2D,
+    [FLIPADST_FLIPADST] = TX_CLASS_2D,
+    [ADST_FLIPADST]     = TX_CLASS_2D,
+    [FLIPADST_ADST]     = TX_CLASS_2D,
+    [IDTX]              = TX_CLASS_2D,
+    [V_DCT]             = TX_CLASS_V,
+    [H_DCT]             = TX_CLASS_H,
+    [V_ADST]            = TX_CLASS_V,
+    [H_ADST]            = TX_CLASS_H,
+    [V_FLIPADST]        = TX_CLASS_V,
+    [H_FLIPADST]        = TX_CLASS_H,
+    [WHT_WHT]           = TX_CLASS_2D,
+};
+
+const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS][DAV1D_N_FILTERS] = {
+    [DAV1D_FILTER_8TAP_REGULAR] = {
+        [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR,
+        [DAV1D_FILTER_8TAP_SHARP]   = FILTER_2D_8TAP_REGULAR_SHARP,
+        [DAV1D_FILTER_8TAP_SMOOTH]  = FILTER_2D_8TAP_REGULAR_SMOOTH,
+    }, [DAV1D_FILTER_8TAP_SHARP] = {
+        [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR,
+        [DAV1D_FILTER_8TAP_SHARP]   = FILTER_2D_8TAP_SHARP,
+        [DAV1D_FILTER_8TAP_SMOOTH]  = FILTER_2D_8TAP_SHARP_SMOOTH,
+    }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+        [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR,
+        [DAV1D_FILTER_8TAP_SHARP]   = FILTER_2D_8TAP_SMOOTH_SHARP,
+        [DAV1D_FILTER_8TAP_SMOOTH]  = FILTER_2D_8TAP_SMOOTH,
+    }, [DAV1D_FILTER_BILINEAR] = {
+        [DAV1D_FILTER_BILINEAR]     = FILTER_2D_BILINEAR,
+    }
+};
+
+const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = {
+    [FILTER_2D_8TAP_REGULAR]        = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR },
+    [FILTER_2D_8TAP_REGULAR_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_REGULAR },
+    [FILTER_2D_8TAP_REGULAR_SHARP]  = { DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_REGULAR },
+    [FILTER_2D_8TAP_SHARP_REGULAR]  = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP   },
+    [FILTER_2D_8TAP_SHARP_SMOOTH]   = { DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SHARP   },
+    [FILTER_2D_8TAP_SHARP]          = { DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SHARP   },
+    [FILTER_2D_8TAP_SMOOTH_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH  },
+    [FILTER_2D_8TAP_SMOOTH]         = { DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SMOOTH  },
+    [FILTER_2D_8TAP_SMOOTH_SHARP]   = { DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SMOOTH  },
+    [FILTER_2D_BILINEAR]            = { DAV1D_FILTER_BILINEAR,     DAV1D_FILTER_BILINEAR     },
+};
+
+const uint8_t dav1d_filter_mode_to_y_mode[5] = {
+    DC_PRED, VERT_PRED, HOR_PRED, HOR_DOWN_PRED, DC_PRED
+};
+
+const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES] = {
+    [DC_PRED]              = 0,
+    [VERT_PRED]            = 1,
+    [HOR_PRED]             = 2,
+    [DIAG_DOWN_LEFT_PRED]  = 3,
+    [DIAG_DOWN_RIGHT_PRED] = 4,
+    [VERT_RIGHT_PRED]      = 4,
+    [HOR_DOWN_PRED]        = 4,
+    [HOR_UP_PRED]          = 4,
+    [VERT_LEFT_PRED]       = 3,
+    [SMOOTH_PRED]          = 0,
+    [SMOOTH_V_PRED]        = 1,
+    [SMOOTH_H_PRED]        = 2,
+    [PAETH_PRED]           = 0,
+};
+
+const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES] = {
+    [BS_32x32] = 6,
+    [BS_32x16] = 5,
+    [BS_32x8]  = 8,
+    [BS_16x32] = 4,
+    [BS_16x16] = 3,
+    [BS_16x8]  = 2,
+    [BS_8x32]  = 7,
+    [BS_8x16]  = 1,
+    [BS_8x8]   = 0,
+};
+
+const Dav1dWarpedMotionParams dav1d_default_wm_params = {
+    .type = DAV1D_WM_TYPE_IDENTITY,
+    .matrix = {
+        0, 0, 1 << 16,
+        0, 0, 1 << 16,
+    },
+    .alpha = 0,
+    .beta = 0,
+    .gamma = 0,
+    .delta = 0,
+};
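+/* The matrix is assumed to follow the AV1 global-motion layout
+ * { tx, ty, m2, m3, m4, m5 }, with the 2x2 part in 16.16 fixed point:
+ * m2 = m5 = 1 << 16 and zero translation map every pixel to itself. */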
+
+const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
+    {  1 * 12 + 0,  2 * 12 + 0 }, // 6
+    {  1 * 12 + 0,  2 * 12 - 1 }, // 7
+    { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+    {  0 * 12 + 1, -1 * 12 + 2 }, // 1
+    {  0 * 12 + 1,  0 * 12 + 2 }, // 2
+    {  0 * 12 + 1,  1 * 12 + 2 }, // 3
+    {  1 * 12 + 1,  2 * 12 + 2 }, // 4
+    {  1 * 12 + 0,  2 * 12 + 1 }, // 5
+    {  1 * 12 + 0,  2 * 12 + 0 }, // 6
+    {  1 * 12 + 0,  2 * 12 - 1 }, // 7
+    { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+    {  0 * 12 + 1, -1 * 12 + 2 }, // 1
+};
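+/* Each entry above is a (y * 12 + x) offset into the CDEF tmp buffer, whose
+ * stride is assumed to be 12. The two duplicated rows at either end let the
+ * secondary taps index dir and dir + 4 (the primary taps use dir + 2)
+ * without wrapping. */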
+
+const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
+    { 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
+    { 2, 1,  80, 1438 }, { 2, 1,  70, 1295 }, { 2, 1, 58, 1177 },
+    { 2, 1,  47, 1079 }, { 2, 1,  37,  996 }, { 2, 1, 30,  925 },
+    { 2, 1,  25,  863 }, { 0, 1,  -1, 2589 }, { 0, 1, -1, 1618 },
+    { 0, 1,  -1, 1177 }, { 0, 1,  -1,  925 }, { 2, 0, 56,   -1 },
+    { 2, 0,  22,   -1 },
+};
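+/* Self-guided restoration parameter sets: r0/r1 are the radii of the two
+ * filter passes and e0/e1 their strengths; a radius of 0 disables that pass,
+ * in which case the matching strength is an unused -1 placeholder. */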
+
+const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
+    255, 128,  85,  64,  51,  43,  37,  32,  28,  26,  23,  21,  20,  18,  17,
+     16,  15,  14,  13,  13,  12,  12,  11,  11,  10,  10,   9,   9,   9,   9,
+      8,   8,   8,   8,   7,   7,   7,   7,   7,   6,   6,   6,   6,   6,   6,
+      6,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   4,   4,   4,   4,
+      4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   3,   3,
+      3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+      3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,
+      2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+      2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+      2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+      2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+      2,   2,   2,   2,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+      0
+};
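+/* Division-free reciprocal table for the self-guided filter:
+ * dav1d_sgr_x_by_x[i] is roughly 256 / (i + 1), clamped to 8 bits. */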
+
+const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = {
+    [DAV1D_FILTER_8TAP_REGULAR] = {
+        {   0,   1,  -3,  63,   4,  -1,   0,   0 },
+        {   0,   1,  -5,  61,   9,  -2,   0,   0 },
+        {   0,   1,  -6,  58,  14,  -4,   1,   0 },
+        {   0,   1,  -7,  55,  19,  -5,   1,   0 },
+        {   0,   1,  -7,  51,  24,  -6,   1,   0 },
+        {   0,   1,  -8,  47,  29,  -6,   1,   0 },
+        {   0,   1,  -7,  42,  33,  -6,   1,   0 },
+        {   0,   1,  -7,  38,  38,  -7,   1,   0 },
+        {   0,   1,  -6,  33,  42,  -7,   1,   0 },
+        {   0,   1,  -6,  29,  47,  -8,   1,   0 },
+        {   0,   1,  -6,  24,  51,  -7,   1,   0 },
+        {   0,   1,  -5,  19,  55,  -7,   1,   0 },
+        {   0,   1,  -4,  14,  58,  -6,   1,   0 },
+        {   0,   0,  -2,   9,  61,  -5,   1,   0 },
+        {   0,   0,  -1,   4,  63,  -3,   1,   0 }
+    }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+        {   0,   1,  14,  31,  17,   1,   0,   0 },
+        {   0,   0,  13,  31,  18,   2,   0,   0 },
+        {   0,   0,  11,  31,  20,   2,   0,   0 },
+        {   0,   0,  10,  30,  21,   3,   0,   0 },
+        {   0,   0,   9,  29,  22,   4,   0,   0 },
+        {   0,   0,   8,  28,  23,   5,   0,   0 },
+        {   0,  -1,   8,  27,  24,   6,   0,   0 },
+        {   0,  -1,   7,  26,  26,   7,  -1,   0 },
+        {   0,   0,   6,  24,  27,   8,  -1,   0 },
+        {   0,   0,   5,  23,  28,   8,   0,   0 },
+        {   0,   0,   4,  22,  29,   9,   0,   0 },
+        {   0,   0,   3,  21,  30,  10,   0,   0 },
+        {   0,   0,   2,  20,  31,  11,   0,   0 },
+        {   0,   0,   2,  18,  31,  13,   0,   0 },
+        {   0,   0,   1,  17,  31,  14,   1,   0 }
+    }, [DAV1D_FILTER_8TAP_SHARP] = {
+        {  -1,   1,  -3,  63,   4,  -1,   1,   0 },
+        {  -1,   3,  -6,  62,   8,  -3,   2,  -1 },
+        {  -1,   4,  -9,  60,  13,  -5,   3,  -1 },
+        {  -2,   5, -11,  58,  19,  -7,   3,  -1 },
+        {  -2,   5, -11,  54,  24,  -9,   4,  -1 },
+        {  -2,   5, -12,  50,  30, -10,   4,  -1 },
+        {  -2,   5, -12,  45,  35, -11,   5,  -1 },
+        {  -2,   6, -12,  40,  40, -12,   6,  -2 },
+        {  -1,   5, -11,  35,  45, -12,   5,  -2 },
+        {  -1,   4, -10,  30,  50, -12,   5,  -2 },
+        {  -1,   4,  -9,  24,  54, -11,   5,  -2 },
+        {  -1,   3,  -7,  19,  58, -11,   5,  -2 },
+        {  -1,   3,  -5,  13,  60,  -9,   4,  -1 },
+        {  -1,   2,  -3,   8,  62,  -6,   3,  -1 },
+        {   0,   1,  -1,   4,  63,  -3,   1,  -1 }
+    /* width <= 4 */
+    }, [3 + DAV1D_FILTER_8TAP_REGULAR] = {
+        {   0,   0,  -2,  63,   4,  -1,   0,   0 },
+        {   0,   0,  -4,  61,   9,  -2,   0,   0 },
+        {   0,   0,  -5,  58,  14,  -3,   0,   0 },
+        {   0,   0,  -6,  55,  19,  -4,   0,   0 },
+        {   0,   0,  -6,  51,  24,  -5,   0,   0 },
+        {   0,   0,  -7,  47,  29,  -5,   0,   0 },
+        {   0,   0,  -6,  42,  33,  -5,   0,   0 },
+        {   0,   0,  -6,  38,  38,  -6,   0,   0 },
+        {   0,   0,  -5,  33,  42,  -6,   0,   0 },
+        {   0,   0,  -5,  29,  47,  -7,   0,   0 },
+        {   0,   0,  -5,  24,  51,  -6,   0,   0 },
+        {   0,   0,  -4,  19,  55,  -6,   0,   0 },
+        {   0,   0,  -3,  14,  58,  -5,   0,   0 },
+        {   0,   0,  -2,   9,  61,  -4,   0,   0 },
+        {   0,   0,  -1,   4,  63,  -2,   0,   0 }
+    }, [3 + DAV1D_FILTER_8TAP_SMOOTH] = {
+        {   0,   0,  15,  31,  17,   1,   0,   0 },
+        {   0,   0,  13,  31,  18,   2,   0,   0 },
+        {   0,   0,  11,  31,  20,   2,   0,   0 },
+        {   0,   0,  10,  30,  21,   3,   0,   0 },
+        {   0,   0,   9,  29,  22,   4,   0,   0 },
+        {   0,   0,   8,  28,  23,   5,   0,   0 },
+        {   0,   0,   7,  27,  24,   6,   0,   0 },
+        {   0,   0,   6,  26,  26,   6,   0,   0 },
+        {   0,   0,   6,  24,  27,   7,   0,   0 },
+        {   0,   0,   5,  23,  28,   8,   0,   0 },
+        {   0,   0,   4,  22,  29,   9,   0,   0 },
+        {   0,   0,   3,  21,  30,  10,   0,   0 },
+        {   0,   0,   2,  20,  31,  11,   0,   0 },
+        {   0,   0,   2,  18,  31,  13,   0,   0 },
+        {   0,   0,   1,  17,  31,  15,   0,   0 }
+#if ARCH_X86_64
+    /* Scaled bilinear filtering is used very rarely, so rather than
+     * adding a dedicated code path we add one more table entry and
+     * reuse the put/prep_8tap_scaled code, which then acts as a
+     * scaled bilinear filter. */
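+    /* Row q below is { 64 - 4*q, 4*q } at the filter centre, i.e. the
+     * 2-tap bilinear kernel padded out to 8 taps. */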
+    }, [5] = {
+        {   0,   0,   0, 60,   4,   0,   0,   0 },
+        {   0,   0,   0, 56,   8,   0,   0,   0 },
+        {   0,   0,   0, 52,  12,   0,   0,   0 },
+        {   0,   0,   0, 48,  16,   0,   0,   0 },
+        {   0,   0,   0, 44,  20,   0,   0,   0 },
+        {   0,   0,   0, 40,  24,   0,   0,   0 },
+        {   0,   0,   0, 36,  28,   0,   0,   0 },
+        {   0,   0,   0, 32,  32,   0,   0,   0 },
+        {   0,   0,   0, 28,  36,   0,   0,   0 },
+        {   0,   0,   0, 24,  40,   0,   0,   0 },
+        {   0,   0,   0, 20,  44,   0,   0,   0 },
+        {   0,   0,   0, 16,  48,   0,   0,   0 },
+        {   0,   0,   0, 12,  52,   0,   0,   0 },
+        {   0,   0,   0,  8,  56,   0,   0,   0 },
+        {   0,   0,   0,  4,  60,   0,   0,   0 }
+#endif
+    }
+};
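+/* Each 15-row group above covers subpel positions 1..15; every row sums to
+ * 64, i.e. the filters have 6 bits of precision. */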
+
+#if ARCH_X86
+#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v2, v4, v6, v1, v3, v5, v7 }
+#else
+#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v1, v2, v3, v4, v5, v6, v7 }
+#endif
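+/* On x86 the taps are stored even-indexed first, then odd-indexed; this is
+ * assumed to match the interleaved order the SIMD warp code loads. */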
+const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
+    // [-1, 0)
+    W( 0,   0, 127,   1,   0, 0, 0, 0 ), W( 0, - 1, 127,   2,   0, 0, 0, 0 ),
+    W( 1, - 3, 127,   4, - 1, 0, 0, 0 ), W( 1, - 4, 126,   6, - 2, 1, 0, 0 ),
+    W( 1, - 5, 126,   8, - 3, 1, 0, 0 ), W( 1, - 6, 125,  11, - 4, 1, 0, 0 ),
+    W( 1, - 7, 124,  13, - 4, 1, 0, 0 ), W( 2, - 8, 123,  15, - 5, 1, 0, 0 ),
+    W( 2, - 9, 122,  18, - 6, 1, 0, 0 ), W( 2, -10, 121,  20, - 6, 1, 0, 0 ),
+    W( 2, -11, 120,  22, - 7, 2, 0, 0 ), W( 2, -12, 119,  25, - 8, 2, 0, 0 ),
+    W( 3, -13, 117,  27, - 8, 2, 0, 0 ), W( 3, -13, 116,  29, - 9, 2, 0, 0 ),
+    W( 3, -14, 114,  32, -10, 3, 0, 0 ), W( 3, -15, 113,  35, -10, 2, 0, 0 ),
+    W( 3, -15, 111,  37, -11, 3, 0, 0 ), W( 3, -16, 109,  40, -11, 3, 0, 0 ),
+    W( 3, -16, 108,  42, -12, 3, 0, 0 ), W( 4, -17, 106,  45, -13, 3, 0, 0 ),
+    W( 4, -17, 104,  47, -13, 3, 0, 0 ), W( 4, -17, 102,  50, -14, 3, 0, 0 ),
+    W( 4, -17, 100,  52, -14, 3, 0, 0 ), W( 4, -18,  98,  55, -15, 4, 0, 0 ),
+    W( 4, -18,  96,  58, -15, 3, 0, 0 ), W( 4, -18,  94,  60, -16, 4, 0, 0 ),
+    W( 4, -18,  91,  63, -16, 4, 0, 0 ), W( 4, -18,  89,  65, -16, 4, 0, 0 ),
+    W( 4, -18,  87,  68, -17, 4, 0, 0 ), W( 4, -18,  85,  70, -17, 4, 0, 0 ),
+    W( 4, -18,  82,  73, -17, 4, 0, 0 ), W( 4, -18,  80,  75, -17, 4, 0, 0 ),
+    W( 4, -18,  78,  78, -18, 4, 0, 0 ), W( 4, -17,  75,  80, -18, 4, 0, 0 ),
+    W( 4, -17,  73,  82, -18, 4, 0, 0 ), W( 4, -17,  70,  85, -18, 4, 0, 0 ),
+    W( 4, -17,  68,  87, -18, 4, 0, 0 ), W( 4, -16,  65,  89, -18, 4, 0, 0 ),
+    W( 4, -16,  63,  91, -18, 4, 0, 0 ), W( 4, -16,  60,  94, -18, 4, 0, 0 ),
+    W( 3, -15,  58,  96, -18, 4, 0, 0 ), W( 4, -15,  55,  98, -18, 4, 0, 0 ),
+    W( 3, -14,  52, 100, -17, 4, 0, 0 ), W( 3, -14,  50, 102, -17, 4, 0, 0 ),
+    W( 3, -13,  47, 104, -17, 4, 0, 0 ), W( 3, -13,  45, 106, -17, 4, 0, 0 ),
+    W( 3, -12,  42, 108, -16, 3, 0, 0 ), W( 3, -11,  40, 109, -16, 3, 0, 0 ),
+    W( 3, -11,  37, 111, -15, 3, 0, 0 ), W( 2, -10,  35, 113, -15, 3, 0, 0 ),
+    W( 3, -10,  32, 114, -14, 3, 0, 0 ), W( 2, - 9,  29, 116, -13, 3, 0, 0 ),
+    W( 2, - 8,  27, 117, -13, 3, 0, 0 ), W( 2, - 8,  25, 119, -12, 2, 0, 0 ),
+    W( 2, - 7,  22, 120, -11, 2, 0, 0 ), W( 1, - 6,  20, 121, -10, 2, 0, 0 ),
+    W( 1, - 6,  18, 122, - 9, 2, 0, 0 ), W( 1, - 5,  15, 123, - 8, 2, 0, 0 ),
+    W( 1, - 4,  13, 124, - 7, 1, 0, 0 ), W( 1, - 4,  11, 125, - 6, 1, 0, 0 ),
+    W( 1, - 3,   8, 126, - 5, 1, 0, 0 ), W( 1, - 2,   6, 126, - 4, 1, 0, 0 ),
+    W( 0, - 1,   4, 127, - 3, 1, 0, 0 ), W( 0,   0,   2, 127, - 1, 0, 0, 0 ),
+    // [0, 1)
+    W( 0,  0,   0, 127,   1,   0,  0,  0),W( 0,  0,  -1, 127,   2,   0,  0,  0),
+    W( 0,  1,  -3, 127,   4,  -2,  1,  0),W( 0,  1,  -5, 127,   6,  -2,  1,  0),
+    W( 0,  2,  -6, 126,   8,  -3,  1,  0),W(-1,  2,  -7, 126,  11,  -4,  2, -1),
+    W(-1,  3,  -8, 125,  13,  -5,  2, -1),W(-1,  3, -10, 124,  16,  -6,  3, -1),
+    W(-1,  4, -11, 123,  18,  -7,  3, -1),W(-1,  4, -12, 122,  20,  -7,  3, -1),
+    W(-1,  4, -13, 121,  23,  -8,  3, -1),W(-2,  5, -14, 120,  25,  -9,  4, -1),
+    W(-1,  5, -15, 119,  27, -10,  4, -1),W(-1,  5, -16, 118,  30, -11,  4, -1),
+    W(-2,  6, -17, 116,  33, -12,  5, -1),W(-2,  6, -17, 114,  35, -12,  5, -1),
+    W(-2,  6, -18, 113,  38, -13,  5, -1),W(-2,  7, -19, 111,  41, -14,  6, -2),
+    W(-2,  7, -19, 110,  43, -15,  6, -2),W(-2,  7, -20, 108,  46, -15,  6, -2),
+    W(-2,  7, -20, 106,  49, -16,  6, -2),W(-2,  7, -21, 104,  51, -16,  7, -2),
+    W(-2,  7, -21, 102,  54, -17,  7, -2),W(-2,  8, -21, 100,  56, -18,  7, -2),
+    W(-2,  8, -22,  98,  59, -18,  7, -2),W(-2,  8, -22,  96,  62, -19,  7, -2),
+    W(-2,  8, -22,  94,  64, -19,  7, -2),W(-2,  8, -22,  91,  67, -20,  8, -2),
+    W(-2,  8, -22,  89,  69, -20,  8, -2),W(-2,  8, -22,  87,  72, -21,  8, -2),
+    W(-2,  8, -21,  84,  74, -21,  8, -2),W(-2,  8, -22,  82,  77, -21,  8, -2),
+    W(-2,  8, -21,  79,  79, -21,  8, -2),W(-2,  8, -21,  77,  82, -22,  8, -2),
+    W(-2,  8, -21,  74,  84, -21,  8, -2),W(-2,  8, -21,  72,  87, -22,  8, -2),
+    W(-2,  8, -20,  69,  89, -22,  8, -2),W(-2,  8, -20,  67,  91, -22,  8, -2),
+    W(-2,  7, -19,  64,  94, -22,  8, -2),W(-2,  7, -19,  62,  96, -22,  8, -2),
+    W(-2,  7, -18,  59,  98, -22,  8, -2),W(-2,  7, -18,  56, 100, -21,  8, -2),
+    W(-2,  7, -17,  54, 102, -21,  7, -2),W(-2,  7, -16,  51, 104, -21,  7, -2),
+    W(-2,  6, -16,  49, 106, -20,  7, -2),W(-2,  6, -15,  46, 108, -20,  7, -2),
+    W(-2,  6, -15,  43, 110, -19,  7, -2),W(-2,  6, -14,  41, 111, -19,  7, -2),
+    W(-1,  5, -13,  38, 113, -18,  6, -2),W(-1,  5, -12,  35, 114, -17,  6, -2),
+    W(-1,  5, -12,  33, 116, -17,  6, -2),W(-1,  4, -11,  30, 118, -16,  5, -1),
+    W(-1,  4, -10,  27, 119, -15,  5, -1),W(-1,  4,  -9,  25, 120, -14,  5, -2),
+    W(-1,  3,  -8,  23, 121, -13,  4, -1),W(-1,  3,  -7,  20, 122, -12,  4, -1),
+    W(-1,  3,  -7,  18, 123, -11,  4, -1),W(-1,  3,  -6,  16, 124, -10,  3, -1),
+    W(-1,  2,  -5,  13, 125,  -8,  3, -1),W(-1,  2,  -4,  11, 126,  -7,  2, -1),
+    W( 0,  1,  -3,   8, 126,  -6,  2,  0),W( 0,  1,  -2,   6, 127,  -5,  1,  0),
+    W( 0,  1,  -2,   4, 127,  -3,  1,  0),W( 0,  0,   0,   2, 127,  -1,  0,  0),
+    // [1, 2)
+    W( 0, 0, 0,   1, 127,   0,   0, 0 ), W( 0, 0, 0, - 1, 127,   2,   0, 0 ),
+    W( 0, 0, 1, - 3, 127,   4, - 1, 0 ), W( 0, 0, 1, - 4, 126,   6, - 2, 1 ),
+    W( 0, 0, 1, - 5, 126,   8, - 3, 1 ), W( 0, 0, 1, - 6, 125,  11, - 4, 1 ),
+    W( 0, 0, 1, - 7, 124,  13, - 4, 1 ), W( 0, 0, 2, - 8, 123,  15, - 5, 1 ),
+    W( 0, 0, 2, - 9, 122,  18, - 6, 1 ), W( 0, 0, 2, -10, 121,  20, - 6, 1 ),
+    W( 0, 0, 2, -11, 120,  22, - 7, 2 ), W( 0, 0, 2, -12, 119,  25, - 8, 2 ),
+    W( 0, 0, 3, -13, 117,  27, - 8, 2 ), W( 0, 0, 3, -13, 116,  29, - 9, 2 ),
+    W( 0, 0, 3, -14, 114,  32, -10, 3 ), W( 0, 0, 3, -15, 113,  35, -10, 2 ),
+    W( 0, 0, 3, -15, 111,  37, -11, 3 ), W( 0, 0, 3, -16, 109,  40, -11, 3 ),
+    W( 0, 0, 3, -16, 108,  42, -12, 3 ), W( 0, 0, 4, -17, 106,  45, -13, 3 ),
+    W( 0, 0, 4, -17, 104,  47, -13, 3 ), W( 0, 0, 4, -17, 102,  50, -14, 3 ),
+    W( 0, 0, 4, -17, 100,  52, -14, 3 ), W( 0, 0, 4, -18,  98,  55, -15, 4 ),
+    W( 0, 0, 4, -18,  96,  58, -15, 3 ), W( 0, 0, 4, -18,  94,  60, -16, 4 ),
+    W( 0, 0, 4, -18,  91,  63, -16, 4 ), W( 0, 0, 4, -18,  89,  65, -16, 4 ),
+    W( 0, 0, 4, -18,  87,  68, -17, 4 ), W( 0, 0, 4, -18,  85,  70, -17, 4 ),
+    W( 0, 0, 4, -18,  82,  73, -17, 4 ), W( 0, 0, 4, -18,  80,  75, -17, 4 ),
+    W( 0, 0, 4, -18,  78,  78, -18, 4 ), W( 0, 0, 4, -17,  75,  80, -18, 4 ),
+    W( 0, 0, 4, -17,  73,  82, -18, 4 ), W( 0, 0, 4, -17,  70,  85, -18, 4 ),
+    W( 0, 0, 4, -17,  68,  87, -18, 4 ), W( 0, 0, 4, -16,  65,  89, -18, 4 ),
+    W( 0, 0, 4, -16,  63,  91, -18, 4 ), W( 0, 0, 4, -16,  60,  94, -18, 4 ),
+    W( 0, 0, 3, -15,  58,  96, -18, 4 ), W( 0, 0, 4, -15,  55,  98, -18, 4 ),
+    W( 0, 0, 3, -14,  52, 100, -17, 4 ), W( 0, 0, 3, -14,  50, 102, -17, 4 ),
+    W( 0, 0, 3, -13,  47, 104, -17, 4 ), W( 0, 0, 3, -13,  45, 106, -17, 4 ),
+    W( 0, 0, 3, -12,  42, 108, -16, 3 ), W( 0, 0, 3, -11,  40, 109, -16, 3 ),
+    W( 0, 0, 3, -11,  37, 111, -15, 3 ), W( 0, 0, 2, -10,  35, 113, -15, 3 ),
+    W( 0, 0, 3, -10,  32, 114, -14, 3 ), W( 0, 0, 2, - 9,  29, 116, -13, 3 ),
+    W( 0, 0, 2, - 8,  27, 117, -13, 3 ), W( 0, 0, 2, - 8,  25, 119, -12, 2 ),
+    W( 0, 0, 2, - 7,  22, 120, -11, 2 ), W( 0, 0, 1, - 6,  20, 121, -10, 2 ),
+    W( 0, 0, 1, - 6,  18, 122, - 9, 2 ), W( 0, 0, 1, - 5,  15, 123, - 8, 2 ),
+    W( 0, 0, 1, - 4,  13, 124, - 7, 1 ), W( 0, 0, 1, - 4,  11, 125, - 6, 1 ),
+    W( 0, 0, 1, - 3,   8, 126, - 5, 1 ), W( 0, 0, 1, - 2,   6, 126, - 4, 1 ),
+    W( 0, 0, 0, - 1,   4, 127, - 3, 1 ), W( 0, 0, 0,   0,   2, 127, - 1, 0 ),
+    // dummy (replicate row index 191)
+    W( 0, 0, 0,   0,   2, 127, - 1, 0 ),
+};
+
+const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = {
+    { 0,  0,  0, -128,    0,  0,  0, 0 }, { 0,  0,  1, -128,   -2,  1,  0, 0 },
+    { 0, -1,  3, -127,   -4,  2, -1, 0 }, { 0, -1,  4, -127,   -6,  3, -1, 0 },
+    { 0, -2,  6, -126,   -8,  3, -1, 0 }, { 0, -2,  7, -125,  -11,  4, -1, 0 },
+    { 1, -2,  8, -125,  -13,  5, -2, 0 }, { 1, -3,  9, -124,  -15,  6, -2, 0 },
+    { 1, -3, 10, -123,  -18,  6, -2, 1 }, { 1, -3, 11, -122,  -20,  7, -3, 1 },
+    { 1, -4, 12, -121,  -22,  8, -3, 1 }, { 1, -4, 13, -120,  -25,  9, -3, 1 },
+    { 1, -4, 14, -118,  -28,  9, -3, 1 }, { 1, -4, 15, -117,  -30, 10, -4, 1 },
+    { 1, -5, 16, -116,  -32, 11, -4, 1 }, { 1, -5, 16, -114,  -35, 12, -4, 1 },
+    { 1, -5, 17, -112,  -38, 12, -4, 1 }, { 1, -5, 18, -111,  -40, 13, -5, 1 },
+    { 1, -5, 18, -109,  -43, 14, -5, 1 }, { 1, -6, 19, -107,  -45, 14, -5, 1 },
+    { 1, -6, 19, -105,  -48, 15, -5, 1 }, { 1, -6, 19, -103,  -51, 16, -5, 1 },
+    { 1, -6, 20, -101,  -53, 16, -6, 1 }, { 1, -6, 20,  -99,  -56, 17, -6, 1 },
+    { 1, -6, 20,  -97,  -58, 17, -6, 1 }, { 1, -6, 20,  -95,  -61, 18, -6, 1 },
+    { 2, -7, 20,  -93,  -64, 18, -6, 2 }, { 2, -7, 20,  -91,  -66, 19, -6, 1 },
+    { 2, -7, 20,  -88,  -69, 19, -6, 1 }, { 2, -7, 20,  -86,  -71, 19, -6, 1 },
+    { 2, -7, 20,  -84,  -74, 20, -7, 2 }, { 2, -7, 20,  -81,  -76, 20, -7, 1 },
+    { 2, -7, 20,  -79,  -79, 20, -7, 2 }, { 1, -7, 20,  -76,  -81, 20, -7, 2 },
+    { 2, -7, 20,  -74,  -84, 20, -7, 2 }, { 1, -6, 19,  -71,  -86, 20, -7, 2 },
+    { 1, -6, 19,  -69,  -88, 20, -7, 2 }, { 1, -6, 19,  -66,  -91, 20, -7, 2 },
+    { 2, -6, 18,  -64,  -93, 20, -7, 2 }, { 1, -6, 18,  -61,  -95, 20, -6, 1 },
+    { 1, -6, 17,  -58,  -97, 20, -6, 1 }, { 1, -6, 17,  -56,  -99, 20, -6, 1 },
+    { 1, -6, 16,  -53, -101, 20, -6, 1 }, { 1, -5, 16,  -51, -103, 19, -6, 1 },
+    { 1, -5, 15,  -48, -105, 19, -6, 1 }, { 1, -5, 14,  -45, -107, 19, -6, 1 },
+    { 1, -5, 14,  -43, -109, 18, -5, 1 }, { 1, -5, 13,  -40, -111, 18, -5, 1 },
+    { 1, -4, 12,  -38, -112, 17, -5, 1 }, { 1, -4, 12,  -35, -114, 16, -5, 1 },
+    { 1, -4, 11,  -32, -116, 16, -5, 1 }, { 1, -4, 10,  -30, -117, 15, -4, 1 },
+    { 1, -3,  9,  -28, -118, 14, -4, 1 }, { 1, -3,  9,  -25, -120, 13, -4, 1 },
+    { 1, -3,  8,  -22, -121, 12, -4, 1 }, { 1, -3,  7,  -20, -122, 11, -3, 1 },
+    { 1, -2,  6,  -18, -123, 10, -3, 1 }, { 0, -2,  6,  -15, -124,  9, -3, 1 },
+    { 0, -2,  5,  -13, -125,  8, -2, 1 }, { 0, -1,  4,  -11, -125,  7, -2, 0 },
+    { 0, -1,  3,   -8, -126,  6, -2, 0 }, { 0, -1,  3,   -6, -127,  4, -1, 0 },
+    { 0, -1,  2,   -4, -127,  3, -1, 0 }, { 0,  0,  1,   -2, -128,  1,  0, 0 },
+};
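+/* Every row above sums to -128: the 7-bit resize kernel is stored negated,
+ * and the filtering code is assumed to negate the accumulated sum again
+ * before rounding and shifting. */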
+
+const uint8_t dav1d_sm_weights[128] = {
+    // Unused, because we always offset by bs, which is at least 2.
+      0,   0,
+    // bs = 2
+    255, 128,
+    // bs = 4
+    255, 149,  85,  64,
+    // bs = 8
+    255, 197, 146, 105,  73,  50,  37,  32,
+    // bs = 16
+    255, 225, 196, 170, 145, 123, 102,  84,
+     68,  54,  43,  33,  26,  20,  17,  16,
+    // bs = 32
+    255, 240, 225, 210, 196, 182, 169, 157,
+    145, 133, 122, 111, 101,  92,  83,  74,
+     66,  59,  52,  45,  39,  34,  29,  25,
+     21,  17,  14,  12,  10,   9,   8,   8,
+    // bs = 64
+    255, 248, 240, 233, 225, 218, 210, 203,
+    196, 189, 182, 176, 169, 163, 156, 150,
+    144, 138, 133, 127, 121, 116, 111, 106,
+    101,  96,  91,  86,  82,  77,  73,  69,
+     65,  61,  57,  54,  50,  47,  44,  41,
+     38,  35,  32,  29,  27,  25,  22,  20,
+     18,  16,  15,  13,  12,  10,   9,   8,
+      7,   6,   6,   5,   5,   4,   4,   4
+};
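+/* As noted above, the weights for block dimension bs start at
+ * dav1d_sm_weights[bs]. A weight w gives the near edge a w/256 contribution,
+ * with the remaining (256 - w)/256 going to the far edge. */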
+
+const uint16_t dav1d_dr_intra_derivative[44] = {
+    // Values that are 0 will never be used
+          0,    // Angles:
+    1023, 0,    //  3,  93, 183
+     547,       //  6,  96, 186
+     372, 0, 0, //  9,  99, 189
+     273,       // 14, 104, 194
+     215, 0,    // 17, 107, 197
+     178,       // 20, 110, 200
+     151, 0,    // 23, 113, 203 (113 & 203 are base angles)
+     132,       // 26, 116, 206
+     116, 0,    // 29, 119, 209
+     102, 0,    // 32, 122, 212
+      90,       // 36, 126, 216
+      80, 0,    // 39, 129, 219
+      71,       // 42, 132, 222
+      64, 0,    // 45, 135, 225 (45 & 135 are base angles)
+      57,       // 48, 138, 228
+      51, 0,    // 51, 141, 231
+      45, 0,    // 54, 144, 234
+      40,       // 58, 148, 238
+      35, 0,    // 61, 151, 241
+      31,       // 64, 154, 244
+      27, 0,    // 67, 157, 247 (67 & 157 are base angles)
+      23,       // 70, 160, 250
+      19, 0,    // 73, 163, 253
+      15, 0,    // 76, 166, 256
+      11, 0,    // 81, 171, 261
+       7,       // 84, 174, 264
+       3        // 87, 177, 267
+};
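+/* Indexed by (predictor angle >> 1); the value is the per-step position
+ * increment in 1/64-pel units used by directional intra prediction
+ * (64 at 45 degrees, i.e. exactly one pixel per step). */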
+
+#if ARCH_X86
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+    [2*idx+0]  = f0, [2*idx+1]  = f1,      \
+    [2*idx+16] = f2, [2*idx+17] = f3,      \
+    [2*idx+32] = f4, [2*idx+33] = f5,      \
+    [2*idx+48] = f6
+#else
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+    [1*idx+0]  = f0, [1*idx+8]  = f1,      \
+    [1*idx+16] = f2, [1*idx+24] = f3,      \
+    [1*idx+32] = f4, [1*idx+40] = f5,      \
+    [1*idx+48] = f6
+#endif
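+/* As with the warp filter above, the x86 variant of F() rearranges the
+ * 7 taps into the order assumed by the SIMD filter-intra code. */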
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
+    {
+        F( 0,  -6, 10,  0,  0,  0, 12,  0 ),
+        F( 1,  -5,  2, 10,  0,  0,  9,  0 ),
+        F( 2,  -3,  1,  1, 10,  0,  7,  0 ),
+        F( 3,  -3,  1,  1,  2, 10,  5,  0 ),
+        F( 4,  -4,  6,  0,  0,  0,  2, 12 ),
+        F( 5,  -3,  2,  6,  0,  0,  2,  9 ),
+        F( 6,  -3,  2,  2,  6,  0,  2,  7 ),
+        F( 7,  -3,  1,  2,  2,  6,  3,  5 ),
+    }, {
+        F( 0, -10, 16,  0,  0,  0, 10,  0 ),
+        F( 1,  -6,  0, 16,  0,  0,  6,  0 ),
+        F( 2,  -4,  0,  0, 16,  0,  4,  0 ),
+        F( 3,  -2,  0,  0,  0, 16,  2,  0 ),
+        F( 4, -10, 16,  0,  0,  0,  0, 10 ),
+        F( 5,  -6,  0, 16,  0,  0,  0,  6 ),
+        F( 6,  -4,  0,  0, 16,  0,  0,  4 ),
+        F( 7,  -2,  0,  0,  0, 16,  0,  2 ),
+    }, {
+        F( 0,  -8,  8,  0,  0,  0, 16,  0 ),
+        F( 1,  -8,  0,  8,  0,  0, 16,  0 ),
+        F( 2,  -8,  0,  0,  8,  0, 16,  0 ),
+        F( 3,  -8,  0,  0,  0,  8, 16,  0 ),
+        F( 4,  -4,  4,  0,  0,  0,  0, 16 ),
+        F( 5,  -4,  0,  4,  0,  0,  0, 16 ),
+        F( 6,  -4,  0,  0,  4,  0,  0, 16 ),
+        F( 7,  -4,  0,  0,  0,  4,  0, 16 ),
+    }, {
+        F( 0,  -2,  8,  0,  0,  0, 10,  0 ),
+        F( 1,  -1,  3,  8,  0,  0,  6,  0 ),
+        F( 2,  -1,  2,  3,  8,  0,  4,  0 ),
+        F( 3,   0,  1,  2,  3,  8,  2,  0 ),
+        F( 4,  -1,  4,  0,  0,  0,  3, 10 ),
+        F( 5,  -1,  3,  4,  0,  0,  4,  6 ),
+        F( 6,  -1,  2,  3,  4,  0,  4,  4 ),
+        F( 7,  -1,  2,  2,  3,  4,  3,  3 ),
+    }, {
+        F( 0, -12, 14,  0,  0,  0, 14,  0 ),
+        F( 1, -10,  0, 14,  0,  0, 12,  0 ),
+        F( 2,  -9,  0,  0, 14,  0, 11,  0 ),
+        F( 3,  -8,  0,  0,  0, 14, 10,  0 ),
+        F( 4, -10, 12,  0,  0,  0,  0, 14 ),
+        F( 5,  -9,  1, 12,  0,  0,  0, 12 ),
+        F( 6,  -8,  0,  0, 12,  0,  1, 11 ),
+        F( 7,  -7,  0,  0,  1, 12,  1,  9 ),
+    }
+};
+
+const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
+    /* Unused */
+     0,  0,
+    /* 2 */
+    19,  0,
+    /* 4 */
+    25, 14,  5,  0,
+    /* 8 */
+    28, 22, 16, 11,  7,  3,  0,  0,
+    /* 16 */
+    30, 27, 24, 21, 18, 15, 12, 10,  8,  6,  4,  3,  0,  0,  0,  0,
+    /* 32 */
+    31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11,  9,
+     8,  7,  6,  5,  4,  4,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,
+};
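+/* Like dav1d_sm_weights, the masks for overlap length l start at
+ * dav1d_obmc_masks[l]; the values are assumed to be 6-bit blend weights for
+ * the neighbouring prediction, leaving 64 - m for the current block. */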
+
+// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+const int16_t dav1d_gaussian_sequence[2048] = {
+    56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+    224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+    112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+    -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+    432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+    192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+    540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+    248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+    248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+    340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+    220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+    -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+    60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+    488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+    -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+    -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+    -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+    -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+    728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+    4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+    772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+    -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+    -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+    -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+    1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+    204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+    548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+    -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+    96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+    -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+    240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+    -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+    896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+    -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+    -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+    -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+    -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+    -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+    424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+    436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+    -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+    -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+    496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+    56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+    -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+    540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+    424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+    -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+    756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+    -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+    60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+    -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+    -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+    308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+    -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+    -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+    284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+    264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+    -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+    908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+    124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+    1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+    -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+    -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+    -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+    320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+    -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+    -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+    -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+    -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+    -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+    636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+    -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+    -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+    392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+    -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+    -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+    -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+    756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+    -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+    472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+    844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+    60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+    -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+    -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+    472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+    652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+    -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+    -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+    -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+    -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+    220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+    412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+    320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+    372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+    924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+    332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+    436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+    -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+    1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+    -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+    -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+    -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+    528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+    -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+    -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+    1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+    20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+    96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+    192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+    648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+    816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+    648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+    -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+    -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+    -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+    384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+    -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+    -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+    64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+    -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+    128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+    112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+    828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+    -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+    0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+    -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+    24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+    508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+    716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+    600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+    -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+    -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+    344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+    -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+    164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+    192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+    288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+    -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+    -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+    556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+    268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+    884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+    -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+    -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+    244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+    -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+    -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+    -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+    1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+    -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+    344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+    -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+    1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+    -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+    504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+    76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+    116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+    28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+    -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+    -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+    -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+    -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+    252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+    312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+    732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+    124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+    -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+    440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+    -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+    648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+    680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+    -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+    -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+    -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+    -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+    372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+    -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+    -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+    -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+    -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+    52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+    716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+    -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+    -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+    104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+    428,   -484
+};
diff --git a/src/tables.h b/src/tables.h
new file mode 100644 (file)
index 0000000..abcf265
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_TABLES_H
+#define DAV1D_SRC_TABLES_H
+
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/levels.h"
+
+extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
+extern const uint8_t /* enum BlockSize */
+                     dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
+// width, height (in 4px blocks), log2 versions of these two
+extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
+typedef struct TxfmInfo {
+    // width, height (in 4px blocks), log2 of them, min/max of log2, sub, ctx
+    uint8_t w, h, lw, lh, min, max, sub, ctx;
+} TxfmInfo;
+extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
+extern const uint8_t /* enum (Rect)TxfmSize */
+                     dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
+extern const uint8_t /* enum TxfmType */
+                     dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
+
+extern const uint8_t /* enum InterPredMode */
+                     dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
+
+extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
+extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
+
+extern const uint8_t dav1d_filter_mode_to_y_mode[5];
+extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
+extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
+extern const uint8_t dav1d_skip_ctx[5][5];
+extern const uint8_t /* enum TxClass */
+                     dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
+extern const uint8_t /* enum Filter2d */
+                     dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
+extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
+extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
+extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
+
+static const unsigned cfl_allowed_mask =
+    (1 << BS_32x32) |
+    (1 << BS_32x16) |
+    (1 << BS_32x8) |
+    (1 << BS_16x32) |
+    (1 << BS_16x16) |
+    (1 << BS_16x8) |
+    (1 << BS_16x4) |
+    (1 << BS_8x32) |
+    (1 << BS_8x16) |
+    (1 << BS_8x8) |
+    (1 << BS_8x4) |
+    (1 << BS_4x16) |
+    (1 << BS_4x8) |
+    (1 << BS_4x4);
+
+static const unsigned wedge_allowed_mask =
+    (1 << BS_32x32) |
+    (1 << BS_32x16) |
+    (1 << BS_32x8) |
+    (1 << BS_16x32) |
+    (1 << BS_16x16) |
+    (1 << BS_16x8) |
+    (1 << BS_8x32) |
+    (1 << BS_8x16) |
+    (1 << BS_8x8);
+
+static const unsigned interintra_allowed_mask =
+    (1 << BS_32x32) |
+    (1 << BS_32x16) |
+    (1 << BS_16x32) |
+    (1 << BS_16x16) |
+    (1 << BS_16x8) |
+    (1 << BS_8x16) |
+    (1 << BS_8x8);
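+/* The masks above are queried per block size, e.g.
+ * allowed = !!(cfl_allowed_mask & (1 << bs)). */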
+
+extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
+
+extern const int8_t dav1d_cdef_directions[12][2];
+
+extern const int16_t dav1d_sgr_params[16][4];
+extern const uint8_t dav1d_sgr_x_by_x[256];
+
+extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
+extern const int8_t dav1d_mc_warp_filter[193][8];
+extern const int8_t dav1d_resize_filter[64][8];
+
+extern const uint8_t dav1d_sm_weights[128];
+extern const uint16_t dav1d_dr_intra_derivative[44];
+extern const int8_t dav1d_filter_intra_taps[5][64];
+
+extern const uint8_t dav1d_obmc_masks[64];
+
+extern const int16_t dav1d_gaussian_sequence[2048]; // for film grain synthesis
+
+#endif /* DAV1D_SRC_TABLES_H */
diff --git a/src/thread.h b/src/thread.h
new file mode 100644 (file)
index 0000000..6cd304e
--- /dev/null
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_H
+#define DAV1D_SRC_THREAD_H
+
+#if defined(_WIN32)
+
+#include <limits.h>
+#include <windows.h>
+
+#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
+
+typedef struct {
+    HANDLE h;
+    void *(*func)(void*);
+    void *arg;
+} pthread_t;
+
+typedef struct {
+    unsigned stack_size;
+} pthread_attr_t;
+
+typedef SRWLOCK pthread_mutex_t;
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef INIT_ONCE pthread_once_t;
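+/* SRW locks and Win32 condition variables need no explicit destruction,
+ * which is why the pthread_*_destroy() wrappers below are no-ops. */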
+
+void dav1d_init_thread(void);
+void dav1d_set_thread_name(const wchar_t *name);
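+/* Forward narrow string literals to the wide-character API: pasting L onto
+ * the macro argument turns "name" into L"name" at the call site. */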
+#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
+
+int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
+                         void *(*func)(void*), void *arg);
+int dav1d_pthread_join(pthread_t *thread, void **res);
+int dav1d_pthread_once(pthread_once_t *once_control,
+                       void (*init_routine)(void));
+
+#define pthread_create dav1d_pthread_create
+#define pthread_join(thread, res) dav1d_pthread_join(&(thread), res)
+#define pthread_once   dav1d_pthread_once
+
+static inline int pthread_attr_init(pthread_attr_t *const attr) {
+    attr->stack_size = 0;
+    return 0;
+}
+
+static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
+    return 0;
+}
+
+static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
+                                            const size_t stack_size)
+{
+    if (stack_size > UINT_MAX) return 1;
+    attr->stack_size = (unsigned) stack_size;
+    return 0;
+}
+
+static inline int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     const void *const attr)
+{
+    InitializeSRWLock(mutex);
+    return 0;
+}
+
+static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+    return 0;
+}
+
+static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+    AcquireSRWLockExclusive(mutex);
+    return 0;
+}
+
+static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+    ReleaseSRWLockExclusive(mutex);
+    return 0;
+}
+
+static inline int pthread_cond_init(pthread_cond_t *const cond,
+                                    const void *const attr)
+{
+    InitializeConditionVariable(cond);
+    return 0;
+}
+
+static inline int pthread_cond_destroy(pthread_cond_t *const cond) {
+    return 0;
+}
+
+static inline int pthread_cond_wait(pthread_cond_t *const cond,
+                                    pthread_mutex_t *const mutex)
+{
+    return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0);
+}
+
+static inline int pthread_cond_signal(pthread_cond_t *const cond) {
+    WakeConditionVariable(cond);
+    return 0;
+}
+
+static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
+    WakeAllConditionVariable(cond);
+    return 0;
+}
+
+#else
+
+#include <pthread.h>
+
+#define dav1d_init_thread() do {} while (0)
+
+/* Thread naming support */
+
+#ifdef __linux__
+
+#include <sys/prctl.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+    prctl(PR_SET_NAME, name);
+}
+
+#elif defined(__APPLE__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+    pthread_setname_np(name);
+}
+
+#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+
+#if defined(__FreeBSD__)
+ /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
+#define _SYS_PARAM_H_
+#include <sys/types.h>
+#endif
+#include <pthread_np.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+    pthread_set_name_np(pthread_self(), name);
+}
+
+#elif defined(__NetBSD__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+    pthread_setname_np(pthread_self(), "%s", (void*)name);
+}
+
+#else
+
+#define dav1d_set_thread_name(name) do {} while (0)
+
+#endif
+
+#endif
+
+#endif /* DAV1D_SRC_THREAD_H */
diff --git a/src/thread_data.h b/src/thread_data.h
new file mode 100644 (file)
index 0000000..62814e6
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_DATA_H
+#define DAV1D_SRC_THREAD_DATA_H
+
+#include "src/thread.h"
+
+struct thread_data {
+    pthread_t thread;
+    pthread_cond_t cond;
+    pthread_mutex_t lock;
+    int inited;
+};
+
+#endif /* DAV1D_SRC_THREAD_DATA_H */
diff --git a/src/thread_task.c b/src/thread_task.c
new file mode 100644 (file)
index 0000000..6c1c139
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/thread_task.h"
+
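+// Frame worker entry point: sleeps on td.cond until the main thread has
+// queued tile data (f->n_tile_data) or requested shutdown (frame_thread.die),
+// decodes the frame, then signals completion through the same condvar.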
+void *dav1d_frame_task(void *const data) {
+    Dav1dFrameContext *const f = data;
+
+    dav1d_set_thread_name("dav1d-frame");
+    pthread_mutex_lock(&f->frame_thread.td.lock);
+    for (;;) {
+        while (!f->n_tile_data && !f->frame_thread.die) {
+            pthread_cond_wait(&f->frame_thread.td.cond,
+                              &f->frame_thread.td.lock);
+        }
+        if (f->frame_thread.die) break;
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+
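+        // on decode failure, clear the shared coefficient buffer
+        // (presumably so later passes don't consume stale coefficients)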
+        if (dav1d_decode_frame(f))
+            memset(f->frame_thread.cf, 0,
+                   (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
+
+        pthread_mutex_lock(&f->frame_thread.td.lock);
+        f->n_tile_data = 0;
+        pthread_cond_signal(&f->frame_thread.td.cond);
+    }
+    pthread_mutex_unlock(&f->frame_thread.td.lock);
+
+    return NULL;
+}
+
+void *dav1d_tile_task(void *const data) {
+    Dav1dTileContext *const t = data;
+    struct FrameTileThreadData *const fttd = t->tile_thread.fttd;
+    const Dav1dFrameContext *const f = t->f;
+    const int tile_thread_idx = (int) (t - f->tc);
+    const uint64_t mask = 1ULL << tile_thread_idx;
+
+    dav1d_set_thread_name("dav1d-tile");
+
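+    // Tile workers pull task indices from a shared pool (fttd): `available`
+    // is a bitmask of idle workers, `cond` wakes workers when tasks arrive,
+    // and `icond` notifies the waiting main thread that a worker went idle.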
+    for (;;) {
+        pthread_mutex_lock(&fttd->lock);
+        fttd->available |= mask;
+        int did_signal = 0;
+        while (!fttd->tasks_left && !t->tile_thread.die) {
+            if (!did_signal) {
+                did_signal = 1;
+                pthread_cond_signal(&fttd->icond);
+            }
+            pthread_cond_wait(&fttd->cond, &fttd->lock);
+        }
+        if (t->tile_thread.die) {
+            pthread_cond_signal(&fttd->icond);
+            pthread_mutex_unlock(&fttd->lock);
+            break;
+        }
+        fttd->available &= ~mask;
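+        // claim the next task; tasks_left counts down, so task indices
+        // are handed out in increasing order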
+        const int task_idx = fttd->num_tasks - fttd->tasks_left--;
+        pthread_mutex_unlock(&fttd->lock);
+
+        if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
+            // we can do full-tile decoding here (and, if n_tc > tiling.cols,
+            // we in fact have to); the loopfilter runs in the main thread
+            Dav1dTileState *const ts = t->ts = &f->ts[task_idx];
+            for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end;
+                 t->by += f->sb_step)
+            {
+                const int error = dav1d_decode_tile_sbrow(t);
+                const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
+
+                // signal progress
+                pthread_mutex_lock(&ts->tile_thread.lock);
+                atomic_store(&ts->progress, progress);
+                pthread_cond_signal(&ts->tile_thread.cond);
+                pthread_mutex_unlock(&ts->tile_thread.lock);
+                if (error) break;
+            }
+        } else {
+            const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0];
+            const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1];
+            Dav1dTileState *const ts = &f->ts[tile_idx];
+            int progress;
+
+            // the interleaved decoding can sometimes cause dependency issues
+            // if one part of the frame decodes significantly faster than others.
+            // Ideally, we'd "skip" tile_sbrows where dependencies are missing,
+            // and resume them later as dependencies are met. That would also
+            // remove the need for the broadcast() below and let us use signal().
+            // However, for now, we use linear dependency tracking because it's
+            // simpler.
+            if ((progress = atomic_load(&ts->progress)) < sby) {
+                pthread_mutex_lock(&ts->tile_thread.lock);
+                while ((progress = atomic_load(&ts->progress)) < sby)
+                    pthread_cond_wait(&ts->tile_thread.cond,
+                                      &ts->tile_thread.lock);
+                pthread_mutex_unlock(&ts->tile_thread.lock);
+            }
+            if (progress == TILE_ERROR) continue;
+
+            // we need to interleave sbrow decoding for all tile cols in a
+            // tile row, since otherwise subsequent threads will be blocked
+            // waiting for the post-filter to complete
+            t->ts = ts;
+            t->by = sby << f->sb_shift;
+            const int error = dav1d_decode_tile_sbrow(t);
+            progress = error ? TILE_ERROR : 1 + sby;
+
+            // signal progress
+            pthread_mutex_lock(&ts->tile_thread.lock);
+            atomic_store(&ts->progress, progress);
+            pthread_cond_broadcast(&ts->tile_thread.cond);
+            pthread_mutex_unlock(&ts->tile_thread.lock);
+        }
+    }
+
+    return NULL;
+}
diff --git a/src/thread_task.h b/src/thread_task.h
new file mode 100644 (file)
index 0000000..309a714
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_TASK_H
+#define DAV1D_SRC_THREAD_TASK_H
+
+#include <limits.h>
+
+#include "src/internal.h"
+
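+// sentinel progress values signalling that frame/tile decoding failed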
+#define FRAME_ERROR (UINT_MAX - 1)
+#define TILE_ERROR (INT_MAX - 1)
+
+int dav1d_decode_frame(Dav1dFrameContext *f);
+void *dav1d_frame_task(void *data);
+
+int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
+void *dav1d_tile_task(void *data);
+
+#endif /* DAV1D_SRC_THREAD_TASK_H */
diff --git a/src/warpmv.c b/src/warpmv.c
new file mode 100644 (file)
index 0000000..a933044
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/warpmv.h"
+
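+// div_lut[f] = round((1 << 22) / (256 + f)): reciprocals used by
+// resolve_divisor_*() to turn a division into a multiply and shift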
+static const uint16_t div_lut[257] = {
+    16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+    15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+    15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+    14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+    13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+    13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+    13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+    12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+    12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+    11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+    11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+    11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+    10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+    10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+    10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010,  9986,
+     9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
+     9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
+     9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
+     9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
+     9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
+     8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
+     8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
+     8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
+     8240,  8224,  8208,  8192,
+};
+
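+// clip to the int16 range, then round to the nearest multiple of (1 << 6),
+// matching the reduced precision at which the shear parameters are consumed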
+static inline int iclip_wmp(const int v) {
+    const int cv = iclip(v, INT16_MIN, INT16_MAX);
+
+    return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);
+}
+
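+// decompose d as (1 << shift) * (256 + f) / 256 and return the table
+// reciprocal, so callers can compute x / d as (x * div_lut[f]) >> shift
+// (the += 14 accounts for the table's 2^22 fixed-point scaling)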
+static inline int resolve_divisor_32(const unsigned d, int *const shift) {
+    *shift = ulog2(d);
+    const int e = d - (1 << *shift);
+    const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) :
+                               e << (8 - *shift);
+    assert(f <= 256);
+    *shift += 14;
+    // Use f as lookup into the precomputed table of multipliers
+    return div_lut[f];
+}
+
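+// Derive the shear parameters alpha/beta/gamma/delta from the affine matrix.
+// Returns nonzero if the matrix is degenerate or the shear is too large for
+// the warp filter, in which case the caller treats the warp as invalid.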
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
+    const int32_t *const mat = wm->matrix;
+
+    if (mat[2] <= 0) return 1;
+
+    wm->alpha = iclip_wmp(mat[2] - 0x10000);
+    wm->beta = iclip_wmp(mat[3]);
+
+    int shift;
+    const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
+    const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
+    const int rnd = (1 << shift) >> 1;
+    wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+    const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
+    wm->delta = iclip_wmp(mat[5] -
+                          apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
+                          0x10000);
+
+    return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
+           (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
+}
+
+static int resolve_divisor_64(const uint64_t d, int *const shift) {
+    *shift = u64log2(d);
+    const int64_t e = d - (1LL << *shift);
+    const int64_t f = *shift > 8 ? (e + (1LL << (*shift - 9))) >> (*shift - 8) :
+                                   e << (8 - *shift);
+    assert(f <= 256);
+    *shift += 14;
+    // Use f as lookup into the precomputed table of multipliers
+    return div_lut[f];
+}
+
+static int get_mult_shift_ndiag(const int64_t px,
+                                const int idet, const int shift)
+{
+    const int64_t v1 = px * idet;
+    const int v2 = apply_sign64((int) ((llabs(v1) +
+                                        ((1LL << shift) >> 1)) >> shift),
+                                v1);
+    return iclip(v2, -0x1fff, 0x1fff);
+}
+
+static int get_mult_shift_diag(const int64_t px,
+                               const int idet, const int shift)
+{
+    const int64_t v1 = px * idet;
+    const int v2 = apply_sign64((int) ((llabs(v1) +
+                                        ((1LL << shift) >> 1)) >> shift),
+                                v1);
+    return iclip(v2, 0xe001, 0x11fff);
+}
+
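+// Given fixed shear terms in mat[2..5], solve for the translation
+// (mat[0], mat[1]) so that the block's centre sample maps according to mv
+// (1/8-pel units scaled to 16.16 fixed point via the 0x2000 factor)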
+void dav1d_set_affine_mv2d(const int bw4, const int bh4,
+                           const mv mv, Dav1dWarpedMotionParams *const wm,
+                           const int bx4, const int by4)
+{
+    int32_t *const mat = wm->matrix;
+    const int rsuy = 2 * bh4 - 1;
+    const int rsux = 2 * bw4 - 1;
+    const int isuy = by4 * 4 + rsuy;
+    const int isux = bx4 * 4 + rsux;
+
+    mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+                   -0x800000, 0x7fffff);
+    mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+                   -0x800000, 0x7fffff);
+}
+
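+// Integer least-squares fit of an affine model to the candidate motion
+// vector samples in pts[] (pts[i][0] = source position, pts[i][1] =
+// projected position), following the AV1 warp-estimation process.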
+int dav1d_find_affine_int(const int (*pts)[2][2], const int np,
+                          const int bw4, const int bh4,
+                          const mv mv, Dav1dWarpedMotionParams *const wm,
+                          const int bx4, const int by4)
+{
+    int32_t *const mat = wm->matrix;
+    int a[2][2] = { { 0, 0 }, { 0, 0 } };
+    int bx[2] = { 0, 0 };
+    int by[2] = { 0, 0 };
+    const int rsuy = 2 * bh4 - 1;
+    const int rsux = 2 * bw4 - 1;
+    const int suy = rsuy * 8;
+    const int sux = rsux * 8;
+    const int duy = suy + mv.y;
+    const int dux = sux + mv.x;
+    const int isuy = by4 * 4 + rsuy;
+    const int isux = bx4 * 4 + rsux;
+
+    for (int i = 0; i < np; i++) {
+        const int dx = pts[i][1][0] - dux;
+        const int dy = pts[i][1][1] - duy;
+        const int sx = pts[i][0][0] - sux;
+        const int sy = pts[i][0][1] - suy;
+        if (abs(sx - dx) < 256 && abs(sy - dy) < 256) {
+            a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8;
+            a[0][1] += ((sx * sy) >> 2) + sx + sy + 4;
+            a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8;
+            bx[0] += ((sx * dx) >> 2) + sx + dx + 8;
+            bx[1] += ((sy * dx) >> 2) + sy + dx + 4;
+            by[0] += ((sx * dy) >> 2) + sx + dy + 4;
+            by[1] += ((sy * dy) >> 2) + sy + dy + 8;
+        }
+    }
+
+    // compute determinant of a
+    const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1];
+    if (det == 0) return 1;
+    int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det);
+    shift -= 16;
+    if (shift < 0) {
+        idet <<= -shift;
+        shift = 0;
+    }
+
+    // solve the least-squares
+    mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] -
+                                 (int64_t) a[0][1] * bx[1], idet, shift);
+    mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] -
+                                  (int64_t) a[0][1] * bx[0], idet, shift);
+    mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] -
+                                  (int64_t) a[0][1] * by[1], idet, shift);
+    mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] -
+                                 (int64_t) a[0][1] * by[0], idet, shift);
+
+    mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+                   -0x800000, 0x7fffff);
+    mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+                   -0x800000, 0x7fffff);
+
+    return 0;
+}
diff --git a/src/warpmv.h b/src/warpmv.h
new file mode 100644 (file)
index 0000000..08e841d
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WARPMV_H
+#define DAV1D_SRC_WARPMV_H
+
+#include "src/levels.h"
+
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm);
+int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
+                          mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+void dav1d_set_affine_mv2d(int bw4, int bh4,
+                           mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+
+#endif /* DAV1D_SRC_WARPMV_H */
diff --git a/src/wedge.c b/src/wedge.c
new file mode 100644 (file)
index 0000000..6b14e9a
--- /dev/null
@@ -0,0 +1,342 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/wedge.h"
+
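+// Wedge masks are built at runtime from six 64x64 "master" templates (one
+// per direction). For each block size, a 16-entry codebook of offsets crops
+// masks out of the masters; the 4:2:2/4:2:0 variants are then derived from
+// the 4:4:4 ones by averaging.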
+enum WedgeDirectionType {
+    WEDGE_HORIZONTAL = 0,
+    WEDGE_VERTICAL = 1,
+    WEDGE_OBLIQUE27 = 2,
+    WEDGE_OBLIQUE63 = 3,
+    WEDGE_OBLIQUE117 = 4,
+    WEDGE_OBLIQUE153 = 5,
+    N_WEDGE_DIRECTIONS
+};
+
+typedef struct {
+    enum WedgeDirectionType direction;
+    int x_offset;
+    int y_offset;
+} wedge_code_type;
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+    { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+    { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+    { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+    { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+    { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+    { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+    { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
+    { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
+    { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+    { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+    { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+    { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+    { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
+    { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+    { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 *  8], 64);
+static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 *  8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_8x8  [2 * 16 *  8 *  8], 64);
+
+static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 *  8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_8x8  [2 * 16 *  8 *  8], 64);
+static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 *  4 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 *  4 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_4x8  [2 * 16 *  4 *  8], 32);
+
+static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 *  4], 64);
+static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_8x8  [2 * 16 *  8 *  8], 64);
+static uint8_t ALIGN(wedge_masks_420_8x4  [2 * 16 *  8 *  4], 64);
+static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 *  4 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_4x8  [2 * 16 *  4 *  8], 32);
+static uint8_t ALIGN(wedge_masks_420_4x4  [2 * 16 *  4 *  4], 16);
+
+const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
+
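+// write one 64-pixel mask row: zeros, then the 8-pixel ramp from src
+// centred around position ctr, then 64s for the remainder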
+static void insert_border(uint8_t *const dst, const uint8_t *const src,
+                          const int ctr)
+{
+    if (ctr > 4) memset(dst, 0, ctr - 4);
+    memcpy(dst + imax(ctr, 4) - 4, src + imax(4 - ctr, 0), imin(64 - ctr, 8));
+    if (ctr < 64 - 4)
+        memset(dst + ctr + 4, 64, 64 - 4 - ctr);
+}
+
+static void transpose(uint8_t *const dst, const uint8_t *const src) {
+    for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+        for (int x = 0, x_off = 0; x < 64; x++, x_off += 64)
+            dst[x_off + y] = src[y_off + x];
+}
+
+static void hflip(uint8_t *const dst, const uint8_t *const src) {
+    for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+        for (int x = 0; x < 64; x++)
+            dst[y_off + 64 - 1 - x] = src[y_off + x];
+}
+
+static void invert(uint8_t *const dst, const uint8_t *const src,
+                   const int w, const int h)
+{
+    for (int y = 0, y_off = 0; y < h; y++, y_off += w)
+        for (int x = 0; x < w; x++)
+            dst[y_off + x] = 64 - src[y_off + x];
+}
+
+static void copy2d(uint8_t *dst, const uint8_t *src,
+                   const int w, const int h, const int x_off, const int y_off)
+{
+    src += y_off * 64 + x_off;
+    for (int y = 0; y < h; y++) {
+        memcpy(dst, src, w);
+        src += 64;
+        dst += w;
+    }
+}
+
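+// derive a chroma mask from the luma one by averaging 2 (4:2:2) or 4
+// (4:2:0) luma samples; `sign` flips the rounding direction so that the
+// two sign variants remain complementary after subsampling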
+static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
+                             const int sign, const int w, const int h,
+                             const int ss_ver)
+{
+    for (int y = 0; y < h; y += 1 + ss_ver) {
+        for (int x = 0; x < w; x += 2) {
+            int sum = luma[x] + luma[x + 1] + 1;
+            if (ss_ver) sum += luma[w + x] + luma[w + x + 1] + 1;
+            chroma[x >> 1] = (sum - sign) >> (1 + ss_ver);
+        }
+        luma += w << ss_ver;
+        chroma += w >> 1;
+    }
+}
+
+static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h,
+                             const enum BlockSize bs,
+                             const uint8_t (*const master)[64 * 64],
+                             const wedge_code_type *const cb,
+                             uint8_t *masks_444, uint8_t *masks_422,
+                             uint8_t *masks_420, const unsigned signs)
+{
+    uint8_t *ptr = dst;
+    for (int n = 0; n < 16; n++) {
+        copy2d(ptr, master[cb[n].direction], w, h,
+               32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
+        ptr += w * h;
+    }
+    for (int n = 0, off = 0; n < 16; n++, off += w * h)
+        invert(ptr + off, dst + off, w, h);
+
+    const int n_stride_444 = (w * h);
+    const int n_stride_422 = n_stride_444 >> 1;
+    const int n_stride_420 = n_stride_444 >> 2;
+    const int sign_stride_444 = 16 * n_stride_444;
+    const int sign_stride_422 = 16 * n_stride_422;
+    const int sign_stride_420 = 16 * n_stride_420;
+    // assign pointers in externally visible array
+    for (int n = 0; n < 16; n++) {
+        const int sign = (signs >> n) & 1;
+        dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444];
+        // not using !sign is intentional here: with no chroma subsampling,
+        // 444 involves no rounding, so both sign variants can share one mask
+        dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444];
+        dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422];
+        dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422];
+        dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420];
+        dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420];
+        masks_444 += n_stride_444;
+        masks_422 += n_stride_422;
+        masks_420 += n_stride_420;
+
+        // since the pointers come from inside, we know that
+        // violating the const is OK here. Any other approach
+        // means we would have to duplicate the sign-correction
+        // logic in two places, which isn't very nice, or mark
+        // the externally-facing table as non-const, which also sucks
+        init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n],
+                    dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0);
+        init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n],
+                    dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0);
+        init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n],
+                    dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1);
+        init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n],
+                    dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1);
+    }
+}
+
+COLD void dav1d_init_wedge_masks(void) {
+    // This function is guaranteed to be called only once
+
+    enum WedgeMasterLineType {
+        WEDGE_MASTER_LINE_ODD,
+        WEDGE_MASTER_LINE_EVEN,
+        WEDGE_MASTER_LINE_VERT,
+        N_WEDGE_MASTER_LINES,
+    };
+    static const uint8_t wedge_master_border[N_WEDGE_MASTER_LINES][8] = {
+        [WEDGE_MASTER_LINE_ODD]  = {  1,  2,  6, 18, 37, 53, 60, 63 },
+        [WEDGE_MASTER_LINE_EVEN] = {  1,  4, 11, 27, 46, 58, 62, 63 },
+        [WEDGE_MASTER_LINE_VERT] = {  0,  2,  7, 21, 43, 57, 62, 64 },
+    };
+    uint8_t master[6][64 * 64];
+
+    // create master templates
+    for (int y = 0, off = 0; y < 64; y++, off += 64)
+        insert_border(&master[WEDGE_VERTICAL][off],
+                      wedge_master_border[WEDGE_MASTER_LINE_VERT], 32);
+    for (int y = 0, off = 0, ctr = 48; y < 64; y += 2, off += 128, ctr--)
+    {
+        insert_border(&master[WEDGE_OBLIQUE63][off],
+                      wedge_master_border[WEDGE_MASTER_LINE_EVEN], ctr);
+        insert_border(&master[WEDGE_OBLIQUE63][off + 64],
+                      wedge_master_border[WEDGE_MASTER_LINE_ODD], ctr - 1);
+    }
+
+    transpose(master[WEDGE_OBLIQUE27], master[WEDGE_OBLIQUE63]);
+    transpose(master[WEDGE_HORIZONTAL], master[WEDGE_VERTICAL]);
+    hflip(master[WEDGE_OBLIQUE117], master[WEDGE_OBLIQUE63]);
+    hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
+
+#define fill(w, h, sz_422, sz_420, hvsw, signs) \
+    fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h,  w, h, BS_##w##x##h, \
+                master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \
+                wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs)
+
+    fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
+    fill(32, 16, 16x16, 16x8,  hltw, 0x7beb);
+    fill(32,  8, 16x8,  16x4,  hltw, 0x6beb);
+    fill(16, 32,  8x32,  8x16, hgtw, 0x7beb);
+    fill(16, 16,  8x16,  8x8,  heqw, 0x7bfb);
+    fill(16,  8,  8x8,   8x4,  hltw, 0x7beb);
+    fill( 8, 32,  4x32,  4x16, hgtw, 0x7aeb);
+    fill( 8, 16,  4x16,  4x8,  hgtw, 0x7beb);
+    fill( 8,  8,  4x8,   4x4,  heqw, 0x7bfb);
+#undef fill
+}
+
+#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
+static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x8  [N_II_PRED_MODES][ 8 *  8], 64);
+static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_4x8  [N_II_PRED_MODES][ 4 *  8], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x4  [N_II_PRED_MODES][ 4 *  4], 16);
+#undef N_II_PRED_MODES
+
+#define set1(sz) \
+    [II_DC_PRED] = ii_dc_mask, \
+    [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
+    [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
+    [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
+#define set(sz_444, sz_422, sz_420) \
+    { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
+const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = {
+    [BS_8x8]   = set( 8x8,   4x8,   4x4),
+    [BS_8x16]  = set( 8x16,  4x16,  4x8),
+    [BS_16x8]  = set(16x16,  8x8,   8x8),
+    [BS_16x16] = set(16x16,  8x16,  8x8),
+    [BS_16x32] = set(16x32,  8x32,  8x16),
+    [BS_32x16] = set(32x32, 16x16, 16x16),
+    [BS_32x32] = set(32x32, 16x32, 16x16),
+};
+#undef set
+#undef set1
+
+static COLD void build_nondc_ii_masks(uint8_t *const mask_v,
+                                      uint8_t *const mask_h,
+                                      uint8_t *const mask_sm,
+                                      const int w, const int h, const int step)
+{
+    static const uint8_t ii_weights_1d[] = {
+        60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,  8,  7,
+         6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,
+    };
+
+    for (int y = 0, off = 0; y < h; y++, off += w) {
+        memset(&mask_v[off], ii_weights_1d[y * step], w);
+        for (int x = 0; x < w; x++) {
+            mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
+            mask_h[off + x] = ii_weights_1d[x * step];
+        }
+    }
+}
+
+COLD void dav1d_init_interintra_masks(void) {
+    // This function is guaranteed to be called only once
+
+    memset(ii_dc_mask, 32, 32 * 32);
+#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1]
+    build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1);
+    build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1);
+    build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2);
+    build_nondc_ii_masks(set(ii_nondc_mask_8x32),   8, 32, 1);
+    build_nondc_ii_masks(set(ii_nondc_mask_8x16),   8, 16, 2);
+    build_nondc_ii_masks(set(ii_nondc_mask_8x8),    8,  8, 4);
+    build_nondc_ii_masks(set(ii_nondc_mask_4x16),   4, 16, 2);
+    build_nondc_ii_masks(set(ii_nondc_mask_4x8),    4,  8, 4);
+    build_nondc_ii_masks(set(ii_nondc_mask_4x4),    4,  4, 8);
+#undef set
+}
diff --git a/src/wedge.h b/src/wedge.h
new file mode 100644 (file)
index 0000000..45f0570
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WEDGE_H
+#define DAV1D_SRC_WEDGE_H
+
+#include "src/levels.h"
+
+void dav1d_init_wedge_masks(void);
+extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+                                 [2 /* sign */][16 /* wedge_idx */];
+
+void dav1d_init_interintra_masks(void);
+extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+                                    [N_INTER_INTRA_PRED_MODES];
+
+#endif /* DAV1D_SRC_WEDGE_H */
diff --git a/src/win32/thread.c b/src/win32/thread.c
new file mode 100644 (file)
index 0000000..5e878bf
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#if defined(_WIN32)
+
+#include <process.h>
+#include <stdlib.h>
+#include <windows.h>
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
+
+COLD void dav1d_init_thread(void) {
+    set_thread_description =
+        (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"),
+                              "SetThreadDescription");
+}
+
+#undef dav1d_set_thread_name
+COLD void dav1d_set_thread_name(const wchar_t *const name) {
+    if (set_thread_description) /* Only available since Windows 10 1607 */
+        set_thread_description(GetCurrentThread(), name);
+}
+
+static COLD unsigned __stdcall thread_entrypoint(void *const data) {
+    pthread_t *const t = data;
+    t->arg = t->func(t->arg);
+    return 0;
+}
+
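+// pthread_create() shim on top of _beginthreadex(); pthread_t carries
+// func/arg so thread_entrypoint() can forward them and stash the return
+// value for dav1d_pthread_join()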
+COLD int dav1d_pthread_create(pthread_t *const thread,
+                              const pthread_attr_t *const attr,
+                              void *(*const func)(void*), void *const arg)
+{
+    const unsigned stack_size = attr ? attr->stack_size : 0;
+    thread->func = func;
+    thread->arg = arg;
+    thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread,
+                                       STACK_SIZE_PARAM_IS_A_RESERVATION, NULL);
+    return !thread->h;
+}
+
+COLD int dav1d_pthread_join(pthread_t *const thread, void **const res) {
+    if (WaitForSingleObject(thread->h, INFINITE))
+        return 1;
+
+    if (res)
+        *res = thread->arg;
+
+    return !CloseHandle(thread->h);
+}
+
+COLD int dav1d_pthread_once(pthread_once_t *const once_control,
+                            void (*const init_routine)(void))
+{
+    BOOL pending = FALSE;
+
+    if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE)
+        return 1;
+
+    if (pending == TRUE)
+        init_routine();
+
+    return !InitOnceComplete(once_control, 0, NULL);
+}
+
+#endif
diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm
new file mode 100644 (file)
index 0000000..643caa0
--- /dev/null
@@ -0,0 +1,1798 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+    dd %%base %+ .%2 - %%table
+  %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1, \
+    d6k0, d6k1, d7k0, d7k1, \
+    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+    d0k0, d0k1, d1k0, d1k1
+%endmacro
+
+SECTION_RODATA 32
+
+pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
+blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+               dd 0x80, 0x00, 0x00
+blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+               dd 0x00, 0x00
+blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+               dd 0x0000
+blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+               dd 0x0000, 0x0000
+blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
+shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
+pw_128:        times 2 dw 128
+pw_2048:       times 2 dw 2048
+tap_table:     ; masks for 8 bit shifts
+               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+               ; weights
+               db  4,  2,  3,  3,  2,  1
+               db -1 * 16 + 1, -2 * 16 + 2
+               db  0 * 16 + 1, -1 * 16 + 2
+               db  0 * 16 + 1,  0 * 16 + 2
+               db  0 * 16 + 1,  1 * 16 + 2
+               db  1 * 16 + 1,  2 * 16 + 2
+               db  1 * 16 + 0,  2 * 16 + 1
+               db  1 * 16 + 0,  2 * 16 + 0
+               db  1 * 16 + 0,  2 * 16 - 1
+               ; the last 6 are repeats of the first 6 so we don't need to & 7
+               db -1 * 16 + 1, -2 * 16 + 2
+               db  0 * 16 + 1, -1 * 16 + 2
+               db  0 * 16 + 1,  0 * 16 + 2
+               db  0 * 16 + 1,  1 * 16 + 2
+               db  1 * 16 + 1,  2 * 16 + 2
+               db  1 * 16 + 0,  2 * 16 + 1
+
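+; each jump table has 24 entries (8 directions x 2 taps) with d6/d7
+; duplicated at the front and d0/d1 at the back, so the dir-2 and dir+2
+; lookups in ACCUMULATE_TAP_BYTE never need a modulo
+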
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+    mov           dird, r6m
+    lea         tableq, [cdef_filter_%1x%2_jmptable]
+    lea           dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+  DEFINE_ARGS dst, stride, left, top, pri, sec, \
+              table, dir, dirjmp, dst4, stride3, k
+ %else
+  DEFINE_ARGS dst, stride, left, top, pri, sec, \
+              table, dir, dirjmp, dst4, dst8, stride3, k
+    lea          dst8q, [dstq+strideq*8]
+ %endif
+%else
+  DEFINE_ARGS dst, stride, h, top1, pri, sec, \
+              table, dir, dirjmp, top2, dst4, stride3, k
+    mov             hq, -8
+    lea          top1q, [top1q+strideq*0]
+    lea          top2q, [top1q+strideq*1]
+%endif
+    lea          dst4q, [dstq+strideq*4]
+%if %1 == 4
+    lea       stride3q, [strideq*3]
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+    mov             kd, 1
+    pxor           m15, m15                     ; sum
+%if %2 == 8
+    pxor           m12, m12
+ %if %1 == 4
+    movd           xm4, [dstq +strideq*0]
+    movd           xm6, [dstq +strideq*1]
+    movd           xm5, [dstq +strideq*2]
+    movd           xm7, [dstq +stride3q ]
+    vinserti128     m4, [dst4q+strideq*0], 1
+    vinserti128     m6, [dst4q+strideq*1], 1
+    vinserti128     m5, [dst4q+strideq*2], 1
+    vinserti128     m7, [dst4q+stride3q ], 1
+    punpckldq       m4, m6
+    punpckldq       m5, m7
+ %else
+    movq           xm4, [dstq+strideq*0]
+    movq           xm5, [dstq+strideq*1]
+    vinserti128     m4, [dstq+strideq*2], 1
+    vinserti128     m5, [dstq+stride3q ], 1
+ %endif
+    punpcklqdq      m4, m5
+%else
+    movd           xm4, [dstq+strideq*0]
+    movd           xm5, [dstq+strideq*1]
+    vinserti128     m4, [dstq+strideq*2], 1
+    vinserti128     m5, [dstq+stride3q ], 1
+    punpckldq       m4, m5
+%endif
+%if %3 == 1
+    mova            m7, m4                      ; min
+    mova            m8, m4                      ; max
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, h, clip
+    ; load p0/p1
+    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
+    add        dirjmpq, tableq
+    call       dirjmpq
+
+%if %8 == 1
+    pmaxub          m7, m5
+    pminub          m8, m5
+    pmaxub          m7, m6
+    pminub          m8, m6
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+    punpcklbw       m5, m6
+    punpcklbw       m6, m4, m4
+    psubusb         m9, m5, m6
+    psubusb         m5, m6, m5
+    por             m9, m5     ; abs_diff_p01(p01 - px)
+    pcmpeqb         m5, m9
+    por             m5, %5
+    psignb          m6, %5, m5
+    psrlw           m5, m9, %2 ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m6
+    paddw          m15, m5
+%else
+    psubusb         m9, m5, m4
+    psubusb         m5, m4, m5
+    psubusb        m11, m6, m4
+    psubusb         m6, m4, m6
+    por             m9, m5      ; abs_diff_p0(p0 - px)
+    por            m11, m6      ; abs_diff_p1(p1 - px)
+    pcmpeqb         m5, m9
+    pcmpeqb         m6, m11
+    punpckhbw      m10, m9, m11
+    punpcklbw       m9, m11
+    por             m5, %5
+    por            m11, m6, %5
+    punpckhbw       m6, m5, m11
+    punpcklbw       m5, m11
+    psignb         m11, %5, m6
+    psrlw           m6, m10, %2 ; emulate 8-bit shift
+    pand            m6, %3
+    psubusb         m6, %4, m6
+    pminub          m6, m10
+    pmaddubsw       m6, m11
+    paddw          m12, m6
+    psignb         m11, %5, m5
+    psrlw           m5, m9, %2  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m11
+    paddw          m15, m5
+%endif
+%endmacro
+
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+    punpcklbw       m4, %3
+ %endif
+    pcmpgtw         %3, m15
+    paddw          m15, %3
+    pmulhrsw       m15, %4
+ %if %5 == 0
+    packsswb       m15, m15
+    paddb           m4, m15
+ %else
+    paddw           m4, m15
+    packuswb        m4, m4 ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+ %endif
+    vextracti128   xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+stride3q ], xm5, 1
+%else
+    pcmpgtw         m6, %3, m12
+    pcmpgtw         m5, %3, m15
+    paddw          m12, m6
+    paddw          m15, m5
+ %if %5 == 1
+    punpckhbw       m5, m4, %3
+    punpcklbw       m4, %3
+ %endif
+    pmulhrsw       m12, %4
+    pmulhrsw       m15, %4
+ %if %5 == 0
+    packsswb       m15, m12
+    paddb           m4, m15
+ %else
+    paddw           m5, m12
+    paddw           m4, m15
+    packuswb        m4, m5 ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+ %endif
+    vextracti128   xm5, m4, 1
+ %if %1 == 4
+    movd   [dstq +strideq*0], xm4
+    movd   [dst4q+strideq*0], xm5
+    pextrd [dstq +strideq*1], xm4, 1
+    pextrd [dst4q+strideq*1], xm5, 1
+    pextrd [dstq +strideq*2], xm4, 2
+    pextrd [dst4q+strideq*2], xm5, 2
+    pextrd [dstq +stride3q ], xm4, 3
+    pextrd [dst4q+stride3q ], xm5, 3
+ %else
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*2], xm5
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+    mov           dird, r6m
+    lea           dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
+ %else
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %endif
+    mov             hd, %1*%2*2/mmsize
+%else
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
+%endif
+    lea           stkq, [px]
+    pxor           m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+    mov             kd, 1
+%if %1 == 4
+    movq           xm4, [stkq+32*0]
+    movhps         xm4, [stkq+32*1]
+    movq           xm5, [stkq+32*2]
+    movhps         xm5, [stkq+32*3]
+    vinserti128     m4, xm5, 1
+%else
+    mova           xm4, [stkq+32*0]             ; px
+    vinserti128     m4, [stkq+32*1], 1
+%endif
+    pxor           m15, m15                     ; sum
+%if %3 == 1
+    mova            m7, m4                      ; max
+    mova            m8, m4                      ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, clip
+    ; load p0/p1
+    movsx         offq, byte [dirq+kq+%1]       ; off1
+%if %6 == 4
+    movq           xm5, [stkq+offq*2+32*0]      ; p0
+    movq           xm6, [stkq+offq*2+32*2]
+    movhps         xm5, [stkq+offq*2+32*1]
+    movhps         xm6, [stkq+offq*2+32*3]
+    vinserti128     m5, xm6, 1
+%else
+    movu           xm5, [stkq+offq*2+32*0]      ; p0
+    vinserti128     m5, [stkq+offq*2+32*1], 1
+%endif
+    neg           offq                          ; -off1
+%if %6 == 4
+    movq           xm6, [stkq+offq*2+32*0]      ; p1
+    movq           xm9, [stkq+offq*2+32*2]
+    movhps         xm6, [stkq+offq*2+32*1]
+    movhps         xm9, [stkq+offq*2+32*3]
+    vinserti128     m6, xm9, 1
+%else
+    movu           xm6, [stkq+offq*2+32*0]      ; p1
+    vinserti128     m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+    ; out-of-bounds values are set to a value that is both a large unsigned
+    ; value and a negative signed value.
+    ; use signed max and unsigned min to remove them
+    pmaxsw          m7, m5                      ; max after p0
+    pminuw          m8, m5                      ; min after p0
+    pmaxsw          m7, m6                      ; max after p1
+    pminuw          m8, m6                      ; min after p1
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+    ; calculate difference before converting
+    psubw           m5, m4                      ; diff_p0(p0 - px)
+    psubw           m6, m4                      ; diff_p1(p1 - px)
+
+    ; convert to 8-bits with signed saturation
+    ; saturating to large diffs has no impact on the results
+    packsswb        m5, m6
+
+    ; group into pairs so we can accumulate using maddubsw
+    pshufb          m5, m12
+    pabsb           m9, m5
+    psignb         m10, %5, m5
+    psrlw           m5, m9, %2                  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+
+    ; use unsigned min since abs diff can equal 0x80
+    pminub          m5, m9
+    pmaddubsw       m5, m10
+    paddw          m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+    pcmpgtw         m9, m11, m15
+    paddw          m15, m9
+    pmulhrsw       m15, %2
+    paddw           m4, m15
+%if %3 == 1
+    pminsw          m4, m7
+    pmaxsw          m4, m8
+%endif
+    packuswb        m4, m4
+    vextracti128   xm5, m4, 1
+%if %1 == 4
+    movd [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd [dstq+strideq*2], xm5
+    pextrd [dstq+stride3q], xm5, 1
+%else
+    movq [dstq+strideq*0], xm4
+    movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
+                                    pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+    mov          edged, edgem
+    cmp          edged, 0xf
+    jne .border_block
+
+    PUSH            r9
+    PUSH           r10
+    PUSH           r11
+%if %2 == 4
+ %assign regs_used 12
+ %if STACK_ALIGNMENT < 32
+    PUSH  r%+regs_used
+  %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 0x60, 16
+    pmovzxbw       xm0, [leftq+1]
+    vpermq          m0, m0, q0110
+    psrldq          m1, m0, 4
+    vpalignr        m2, m0, m0, 12
+    movu    [rsp+0x10], m0
+    movu    [rsp+0x28], m1
+    movu    [rsp+0x40], m2
+%elif %1 == 4
+    PUSH           r12
+ %assign regs_used 13
+ %if STACK_ALIGNMENT < 32
+    PUSH  r%+regs_used
+   %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 8*2+%1*%2*1, 16
+    pmovzxwd        m0, [leftq]
+    mova    [rsp+0x10], m0
+%else
+    PUSH           r12
+    PUSH           r13
+ %assign regs_used 14
+ %if STACK_ALIGNMENT < 32
+    PUSH  r%+regs_used
+  %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 8*2+%1*%2*2+32, 16
+    lea            r11, [strideq*3]
+    movu           xm4, [dstq+strideq*2]
+    pmovzxwq        m0, [leftq+0]
+    pmovzxwq        m1, [leftq+8]
+    vinserti128     m4, [dstq+r11], 1
+    pmovzxbd        m2, [leftq+1]
+    pmovzxbd        m3, [leftq+9]
+    mova    [rsp+0x10], m0
+    mova    [rsp+0x30], m1
+    mova    [rsp+0x50], m2
+    mova    [rsp+0x70], m3
+    mova    [rsp+0x90], m4
+%endif
+
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
+    mov       dampingd, r7m
+    xor          zerod, zerod
+    movifnidn     prid, prim
+    sub       dampingd, 31
+    movifnidn  secdmpd, secdmpm
+    test          prid, prid
+    jz .sec_only
+    movd           xm0, prid
+    lzcnt      pridmpd, prid
+    add        pridmpd, dampingd
+    cmovs      pridmpd, zerod
+    mov        [rsp+0], pridmpq                 ; pri_shift
+    test       secdmpd, secdmpd
+    jz .pri_only
+    movd           xm1, secdmpd
+    lzcnt      secdmpd, secdmpd
+    add        secdmpd, dampingd
+    cmovs      secdmpd, zerod
+    mov        [rsp+8], secdmpq                 ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+
+    ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    vpbroadcastb    m1, xm1                     ; sec_strength
+    and           prid, 1
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    lea           secq, [tableq+12]             ; sec_taps
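+    ; per the AV1 spec, the two primary taps are {4, 2} for even pri_strength
+    ; and {3, 3} for odd (hence the parity index above), while the two
+    ; secondary taps are always {2, 1}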
+
+    PREP_REGS       %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+    LOAD_BLOCK      %1, %2, 1
+.k_loop:
+    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
+    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
+    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
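+    ; primary taps run along the detected direction (dir + 0); secondary
+    ; taps run along the directions rotated by ±2 units (±45°), per the
+    ; CDEF definition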
+    dec             kq
+    jge .k_loop
+
+    vpbroadcastd   m10, [pw_2048]
+    pxor            m9, m9
+    ADJUST_PIXEL    %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+    mov           dstq, dst4q
+    lea          top1q, [rsp+0x90]
+    lea          top2q, [rsp+0xA0]
+    lea          dst4q, [dst4q+strideq*4]
+    add             hq, 4
+    jl .v_loop
+%endif
+    RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+    ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    and           prid, 1
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    PREP_REGS       %1, %2
+    vpbroadcastd    m3, [pw_2048]
+    pxor            m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+    LOAD_BLOCK      %1, %2
+.pri_k_loop:
+    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
+    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+    dec             kq
+    jge .pri_k_loop
+    ADJUST_PIXEL    %1, %2, m1, m3
+%if %1*%2 > mmsize
+    mov           dstq, dst4q
+    lea          top1q, [rsp+0x90]
+    lea          top2q, [rsp+0xA0]
+    lea          dst4q, [dst4q+strideq*4]
+    add             hq, 4
+    jl .pri_v_loop
+%endif
+    RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
+    movd           xm1, secdmpd
+    lzcnt      secdmpd, secdmpd
+    add        secdmpd, dampingd
+    cmovs      secdmpd, zerod
+    mov        [rsp+8], secdmpq                 ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, table
+    lea         tableq, [tap_table]
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+    ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
+    vpbroadcastb    m1, xm1                     ; sec_strength
+    lea           secq, [tableq+12]             ; sec_taps
+    PREP_REGS       %1, %2
+    vpbroadcastd    m2, [pw_2048]
+    pxor            m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+    LOAD_BLOCK      %1, %2
+.sec_k_loop:
+    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
+    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+    dec             kq
+    jge .sec_k_loop
+    ADJUST_PIXEL    %1, %2, m0, m2
+%if %1*%2 > mmsize
+    mov           dstq, dst4q
+    lea          top1q, [rsp+0x90]
+    lea          top2q, [rsp+0xA0]
+    lea          dst4q, [dst4q+strideq*4]
+    add             hq, 4
+    jl .sec_v_loop
+%endif
+    RET
+
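+; The .d<dir>k<k> subroutines below gather, for tap k of the given direction,
+; the two neighbouring pixel rows at offsets +off and -off into m5 and m6;
+; pixels that fall into the left or top neighbours are blended in from the
+; copies stashed on the stack / in top1q+top2q earlier. (A rough description;
+; exact register roles vary per block size.)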
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq    m6, [dstq+strideq*1-1]
+    vpbroadcastq   m10, [dstq+strideq*2-1]
+    movd           xm5, [topq+strideq*1+1]
+    movd           xm9, [dstq+strideq*0+1]
+    psrldq         m11, m6, 2
+    psrldq         m12, m10, 2
+    vinserti128     m6, [dstq+stride3q -1], 1
+    vinserti128    m10, [dstq+strideq*4-1], 1
+    vpblendd        m5, m11, 0x10
+    vpblendd        m9, m12, 0x10
+    movu           m11, [blend_4x4+16]
+    punpckldq       m6, m10
+    punpckldq       m5, m9
+    vpblendvb       m6, [rsp+gprsize+0x28], m11
+ %else
+    movd           xm5, [topq +strideq*1+1]
+    movq           xm6, [dstq +strideq*1-1]
+    movq          xm10, [dstq +stride3q -1]
+    movq          xm11, [dst4q+strideq*1-1]
+    pinsrd         xm5, [dstq +strideq*0+1], 1
+    movhps         xm6, [dstq +strideq*2-1]
+    movhps        xm10, [dst4q+strideq*0-1]
+    movhps        xm11, [dst4q+strideq*2-1]
+    psrldq         xm9, xm6, 2
+    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
+    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
+    psrldq         xm9, xm11, 2
+    psrldq        xm10, 2
+    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
+    movd           xm9, [dst4q+stride3q -1]
+    pinsrd         xm9, [dst4q+strideq*4-1], 1
+    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
+    pmovzxbw        m9, [leftq+3]
+    vinserti128     m6, xm11, 1
+    movu           m11, [blend_4x8_0+4]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m9, m11
+ %endif
+%else
+    lea            r13, [blend_8x8_0+16]
+    movq           xm5, [top2q         +1]
+    vbroadcasti128 m10, [dstq+strideq*1-1]
+    vbroadcasti128 m11, [dstq+strideq*2-1]
+    movhps         xm5, [dstq+strideq*0+1]
+    vinserti128     m6, m10, [dstq+stride3q -1], 1
+    vinserti128     m9, m11, [dstq+strideq*4-1], 1
+    psrldq         m10, 2
+    psrldq         m11, 2
+    punpcklqdq      m6, m9
+    movu            m9, [r13+hq*2*1+16*1]
+    punpcklqdq     m10, m11
+    vpblendd        m5, m10, 0xF0
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64+8*1], m9
+%endif
+    ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+    movq           xm6, [dstq+strideq*0-1]
+    movq           xm9, [dstq+strideq*1-1]
+    vinserti128     m6, [dstq+strideq*2-1], 1
+    vinserti128     m9, [dstq+stride3q -1], 1
+    movu           m11, [rsp+gprsize+0x10]
+    pcmpeqd        m12, m12
+    psrldq          m5, m6, 2
+    psrldq         m10, m9, 2
+    psrld          m12, 24
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+    vpblendvb       m6, m11, m12
+ %else
+    movq           xm6, [dstq +strideq*0-1]
+    movq           xm9, [dstq +strideq*2-1]
+    movhps         xm6, [dstq +strideq*1-1]
+    movhps         xm9, [dstq +stride3q -1]
+    movq          xm10, [dst4q+strideq*0-1]
+    movhps        xm10, [dst4q+strideq*1-1]
+    psrldq         xm5, xm6, 2
+    psrldq        xm11, xm9, 2
+    shufps         xm5, xm11, q2020
+    movq          xm11, [dst4q+strideq*2-1]
+    movhps        xm11, [dst4q+stride3q -1]
+    shufps         xm6, xm9, q2020
+    shufps         xm9, xm10, xm11, q2020
+    vinserti128     m6, xm9, 1
+    pmovzxbw        m9, [leftq+1]
+    psrldq        xm10, 2
+    psrldq        xm11, 2
+    shufps        xm10, xm11, q2020
+    vpbroadcastd   m11, [blend_4x8_0+4]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m9, m11
+ %endif
+%else
+    movu           xm5, [dstq+strideq*0-1]
+    movu           xm9, [dstq+strideq*1-1]
+    vinserti128     m5, [dstq+strideq*2-1], 1
+    vinserti128     m9, [dstq+stride3q -1], 1
+    movu           m10, [blend_8x8_0+16]
+    punpcklqdq      m6, m5, m9
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64], m10
+    psrldq          m5, 2
+    psrldq          m9, 2
+    punpcklqdq      m5, m9
+%endif
+    ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq   m10, [dstq+strideq*1-1]
+    vpbroadcastq   m11, [dstq+strideq*2-1]
+    movd           xm6, [topq+strideq*1-1]
+    movd           xm9, [dstq+strideq*0-1]
+    psrldq          m5, m10, 2
+    psrldq         m12, m11, 2
+    vpblendd        m6, m10, 0x10
+    vpblendd        m9, m11, 0x10
+    movu           m10, [blend_4x4]
+    vinserti128     m5, [dstq+stride3q +1], 1
+    vinserti128    m12, [dstq+strideq*4+1], 1
+    punpckldq       m6, m9
+    punpckldq       m5, m12
+    vpblendvb       m6, [rsp+gprsize+0x40], m10
+ %else
+    movd           xm6, [topq +strideq*1-1]
+    movq           xm9, [dstq +strideq*1-1]
+    movq          xm10, [dstq +stride3q -1]
+    movq          xm11, [dst4q+strideq*1-1]
+    pinsrd         xm6, [dstq +strideq*0-1], 1
+    movhps         xm9, [dstq +strideq*2-1]
+    movhps        xm10, [dst4q+strideq*0-1]
+    movhps        xm11, [dst4q+strideq*2-1]
+    psrldq         xm5, xm9, 2
+    shufps         xm6, xm9, q2010
+    psrldq         xm9, xm10, 2
+    shufps         xm5, xm9, q2020
+    shufps        xm10, xm11, q2020
+    movd           xm9, [dst4q+stride3q +1]
+    vinserti128     m6, xm10, 1
+    pinsrd         xm9, [dst4q+strideq*4+1], 1
+    psrldq        xm11, 2
+    pmovzxbw       m10, [leftq-1]
+    shufps        xm11, xm9, q1020
+    movu            m9, [blend_4x8_0]
+    vinserti128     m5, xm11, 1
+    vpblendvb       m6, m10, m9
+ %endif
+%else
+    lea            r13, [blend_8x8_0+8]
+    movq           xm6, [top2q         -1]
+    vbroadcasti128  m5, [dstq+strideq*1-1]
+    vbroadcasti128  m9, [dstq+strideq*2-1]
+    movhps         xm6, [dstq+strideq*0-1]
+    movu           m11, [r13+hq*2*1+16*1]
+    punpcklqdq     m10, m5, m9
+    vinserti128     m5, [dstq+stride3q -1], 1
+    vinserti128     m9, [dstq+strideq*4-1], 1
+    vpblendd        m6, m10, 0xF0
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*1], m11
+    psrldq          m5, 2
+    psrldq          m9, 2
+    punpcklqdq      m5, m9
+%endif
+    ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq+strideq*1  ]
+    vpbroadcastd    m5, [dstq+strideq*1  ]
+    vpbroadcastd    m9, [dstq+strideq*2  ]
+    vpblendd       xm6, [dstq+strideq*0-4], 0x2
+    vpblendd        m5, m9, 0x22
+    vpblendd        m6, m5, 0x30
+    vinserti128     m5, [dstq+stride3q    ], 1
+    vpblendd        m5, [dstq+strideq*4-20], 0x20
+ %else
+    movd           xm6, [topq +strideq*1]
+    movd           xm5, [dstq +strideq*1]
+    movd           xm9, [dstq +stride3q ]
+    movd          xm10, [dst4q+strideq*1]
+    movd          xm11, [dst4q+stride3q ]
+    pinsrd         xm6, [dstq +strideq*0], 1
+    pinsrd         xm5, [dstq +strideq*2], 1
+    pinsrd         xm9, [dst4q+strideq*0], 1
+    pinsrd        xm10, [dst4q+strideq*2], 1
+    pinsrd        xm11, [dst4q+strideq*4], 1
+    punpcklqdq     xm6, xm5
+    punpcklqdq     xm5, xm9
+    punpcklqdq     xm9, xm10
+    punpcklqdq    xm10, xm11
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+ %endif
+%else
+    movq           xm6, [top2q         ]
+    movq           xm5, [dstq+strideq*1]
+    movq           xm9, [dstq+stride3q ]
+    movhps         xm6, [dstq+strideq*0]
+    movhps         xm5, [dstq+strideq*2]
+    movhps         xm9, [dstq+strideq*4]
+    vinserti128     m6, xm5, 1
+    vinserti128     m5, xm9, 1
+%endif
+    ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [dstq +strideq*2-2]
+    movd           xm9, [dstq +stride3q -2]
+    movd           xm5, [topq +strideq*0+2]
+    movd          xm10, [topq +strideq*1+2]
+    pinsrw         xm6, [leftq+4], 0
+    pinsrw         xm9, [leftq+6], 0
+    vinserti128     m5, [dstq +strideq*0+2], 1
+    vinserti128    m10, [dstq +strideq*1+2], 1
+    vinserti128     m6, [dst4q+strideq*0-2], 1
+    vinserti128     m9, [dst4q+strideq*1-2], 1
+    punpckldq       m5, m10
+    punpckldq       m6, m9
+ %else
+    movq           xm6, [dstq +strideq*2-2]
+    movd          xm10, [dst4q+strideq*2-2]
+    movd           xm5, [topq +strideq*0+2]
+    movq           xm9, [dst4q+strideq*0-2]
+    movhps         xm6, [dstq +stride3q -2]
+    pinsrw        xm10, [dst4q+stride3q   ], 3
+    pinsrd         xm5, [topq +strideq*1+2], 1
+    movhps         xm9, [dst4q+strideq*1-2]
+    pinsrd        xm10, [dst8q+strideq*0-2], 2
+    pinsrd         xm5, [dstq +strideq*0+2], 2
+    pinsrd        xm10, [dst8q+strideq*1-2], 3
+    pinsrd         xm5, [dstq +strideq*1+2], 3
+    shufps        xm11, xm6, xm9, q3131
+    shufps         xm6, xm9, q2020
+    movu            m9, [blend_4x8_3+8]
+    vinserti128     m6, xm10, 1
+    vinserti128     m5, xm11, 1
+    vpblendvb       m6, [rsp+gprsize+16+8], m9
+ %endif
+%else
+    lea            r13, [blend_8x8_1+16]
+    movq           xm6, [dstq +strideq*2-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq           xm5, [top1q          +2]
+    movq          xm10, [top2q          +2]
+    movu           m11, [r13+hq*2*2+16*2]
+    vinserti128     m6, [dst4q+strideq*0-2], 1
+    vinserti128     m9, [dst4q+strideq*1-2], 1
+    vinserti128     m5, [dstq +strideq*0+2], 1
+    vinserti128    m10, [dstq +strideq*1+2], 1
+    punpcklqdq      m6, m9
+    punpcklqdq      m5, m10
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*2], m11
+%endif
+    ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq    m6, [dstq+strideq*1-2]
+    vpbroadcastq    m9, [dstq+strideq*2-2]
+    movd           xm5, [topq+strideq*1+2]
+    movd          xm10, [dstq+strideq*0+2]
+    psrldq         m11, m6, 4
+    psrldq         m12, m9, 4
+    vpblendd        m5, m11, 0x10
+    movq          xm11, [leftq+2]
+    vinserti128     m6, [dstq+stride3q -2], 1
+    punpckldq     xm11, xm11
+    vpblendd       m10, m12, 0x10
+    pcmpeqd        m12, m12
+    pmovzxwd       m11, xm11
+    psrld          m12, 16
+    punpckldq       m6, m9
+    vpbroadcastd    m9, [dstq+strideq*4-2]
+    vpblendvb       m6, m11, m12
+    punpckldq       m5, m10
+    vpblendd        m6, m9, 0x20
+ %else
+    movd           xm5, [topq +strideq*1+2]
+    movq           xm6, [dstq +strideq*1-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq          xm10, [dst4q+strideq*1-2]
+    movd          xm11, [dst4q+stride3q -2]
+    pinsrd         xm5, [dstq +strideq*0+2], 1
+    movhps         xm6, [dstq +strideq*2-2]
+    movhps         xm9, [dst4q+strideq*0-2]
+    movhps        xm10, [dst4q+strideq*2-2]
+    pinsrd        xm11, [dst4q+strideq*4-2], 1
+    shufps         xm5, xm6, q3110
+    shufps         xm6, xm9, q2020
+    shufps         xm9, xm10, q3131
+    shufps        xm10, xm11, q1020
+    movu           m11, [blend_4x8_2+4]
+    vinserti128     m6, xm10, 1
+    vinserti128     m5, xm9, 1
+    vpblendvb       m6, [rsp+gprsize+16+4], m11
+ %endif
+%else
+    lea            r13, [blend_8x8_1+16]
+    movq           xm5, [top2q         +2]
+    vbroadcasti128  m6, [dstq+strideq*1-2]
+    vbroadcasti128  m9, [dstq+strideq*2-2]
+    movhps         xm5, [dstq+strideq*0+2]
+    shufps         m10, m6, m9, q2121
+    vinserti128     m6, [dstq+stride3q -2], 1
+    vinserti128     m9, [dstq+strideq*4-2], 1
+    movu           m11, [r13+hq*2*1+16*1]
+    vpblendd        m5, m10, 0xF0
+    punpcklqdq      m6, m9
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*1], m11
+%endif
+    ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+    movq          xm11, [leftq]
+    movq           xm6, [dstq+strideq*0-2]
+    movq           xm9, [dstq+strideq*1-2]
+    vinserti128     m6, [dstq+strideq*2-2], 1
+    vinserti128     m9, [dstq+stride3q -2], 1
+    punpckldq     xm11, xm11
+    psrldq          m5, m6, 4
+    psrldq         m10, m9, 4
+    pmovzxwd       m11, xm11
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+    pblendw         m6, m11, 0x05
+ %else
+    movq           xm5, [dstq +strideq*0-2]
+    movq           xm9, [dstq +strideq*2-2]
+    movq          xm10, [dst4q+strideq*0-2]
+    movq          xm11, [dst4q+strideq*2-2]
+    movhps         xm5, [dstq +strideq*1-2]
+    movhps         xm9, [dstq +stride3q -2]
+    movhps        xm10, [dst4q+strideq*1-2]
+    movhps        xm11, [dst4q+stride3q -2]
+    shufps         xm6, xm5, xm9, q2020
+    shufps         xm5, xm9, q3131
+    shufps         xm9, xm10, xm11, q2020
+    shufps        xm10, xm11, q3131
+    pmovzxwd       m11, [leftq]
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+    pblendw         m6, m11, 0x55
+ %endif
+%else
+    mova           m11, [rsp+gprsize+16+hq*8+64]
+    movu           xm5, [dstq+strideq*0-2]
+    movu           xm9, [dstq+strideq*1-2]
+    vinserti128     m5, [dstq+strideq*2-2], 1
+    vinserti128     m9, [dstq+stride3q -2], 1
+    shufps          m6, m5, m9, q1010
+    shufps          m5, m9, q2121
+    pblendw         m6, m11, 0x11
+%endif
+    ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq   m11, [dstq+strideq*1-2]
+    vpbroadcastq   m12, [dstq+strideq*2-2]
+    movd           xm6, [topq+strideq*1-2]
+    movd           xm9, [dstq+strideq*0-2]
+    pblendw        m11, [leftq-16+2], 0x01
+    pblendw        m12, [leftq-16+4], 0x01
+    pinsrw         xm9, [leftq- 0+0], 0
+    psrldq          m5, m11, 4
+    psrldq         m10, m12, 4
+    vinserti128     m5, [dstq+stride3q +2], 1
+    vinserti128    m10, [dstq+strideq*4+2], 1
+    vpblendd        m6, m11, 0x10
+    vpblendd        m9, m12, 0x10
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+ %else
+    movd           xm6, [topq +strideq*1-2]
+    movq           xm5, [dstq +strideq*1-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq          xm10, [dst4q+strideq*1-2]
+    movd          xm11, [dst4q+stride3q +2]
+    pinsrw         xm6, [dstq +strideq*0  ], 3
+    movhps         xm5, [dstq +strideq*2-2]
+    movhps         xm9, [dst4q+strideq*0-2]
+    movhps        xm10, [dst4q+strideq*2-2]
+    pinsrd        xm11, [dst4q+strideq*4+2], 1
+    shufps         xm6, xm5, q2010
+    shufps         xm5, xm9, q3131
+    shufps         xm9, xm10, q2020
+    shufps        xm10, xm11, q1031
+    movu           m11, [blend_4x8_2]
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, [rsp+gprsize+16-4], m11
+ %endif
+%else
+    lea            r13, [blend_8x8_1+8]
+    movq           xm6, [top2q         -2]
+    vbroadcasti128  m5, [dstq+strideq*1-2]
+    vbroadcasti128 m10, [dstq+strideq*2-2]
+    movhps         xm6, [dstq+strideq*0-2]
+    punpcklqdq      m9, m5, m10
+    vinserti128     m5, [dstq+stride3q -2], 1
+    vinserti128    m10, [dstq+strideq*4-2], 1
+    movu           m11, [r13+hq*2*1+16*1]
+    vpblendd        m6, m9, 0xF0
+    shufps          m5, m10, q2121
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*1], m11
+%endif
+    ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+    vinserti128     m6, [dstq +strideq*0-2], 1
+    vinserti128     m9, [dstq +strideq*1-2], 1
+    movd           xm5, [dstq +strideq*2+2]
+    movd          xm10, [dstq +stride3q +2]
+    pblendw         m6, [leftq-16+0], 0x01
+    pblendw         m9, [leftq-16+2], 0x01
+    vinserti128     m5, [dst4q+strideq*0+2], 1
+    vinserti128    m10, [dst4q+strideq*1+2], 1
+    vpblendd        m6, [topq +strideq*0-2], 0x01
+    vpblendd        m9, [topq +strideq*1-2], 0x01
+    punpckldq       m5, m10
+    punpckldq       m6, m9
+ %else
+    movd           xm6, [topq +strideq*0-2]
+    movq           xm5, [dstq +strideq*2-2]
+    movq           xm9, [dst4q+strideq*0-2]
+    movd          xm10, [dst4q+strideq*2+2]
+    pinsrd         xm6, [topq +strideq*1-2], 1
+    movhps         xm5, [dstq +stride3q -2]
+    movhps         xm9, [dst4q+strideq*1-2]
+    pinsrd        xm10, [dst4q+stride3q +2], 1
+    pinsrd         xm6, [dstq +strideq*0-2], 2
+    pinsrd        xm10, [dst8q+strideq*0+2], 2
+    pinsrd         xm6, [dstq +strideq*1-2], 3
+    pinsrd        xm10, [dst8q+strideq*1+2], 3
+    shufps        xm11, xm5, xm9, q2020
+    shufps         xm5, xm9, q3131
+    movu            m9, [blend_4x8_3]
+    vinserti128     m6, xm11, 1
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, [rsp+gprsize+16-8], m9
+ %endif
+%else
+    lea            r13, [blend_8x8_1]
+    movu           m11, [r13+hq*2*2+16*2]
+    movq           xm6, [top1q          -2]
+    movq           xm9, [top2q          -2]
+    movq           xm5, [dstq +strideq*2+2]
+    movq          xm10, [dstq +stride3q +2]
+    vinserti128     m6, [dstq +strideq*0-2], 1
+    vinserti128     m9, [dstq +strideq*1-2], 1
+    vinserti128     m5, [dst4q+strideq*0+2], 1
+    vinserti128    m10, [dst4q+strideq*1+2], 1
+    punpcklqdq      m6, m9
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*2], m11
+    punpcklqdq      m5, m10
+%endif
+    ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq +strideq*0-1]
+    movd           xm9, [topq +strideq*1-1]
+    movd           xm5, [dstq +strideq*2+1]
+    movd          xm10, [dstq +stride3q +1]
+    pcmpeqd        m12, m12
+    pmovzxbw       m11, [leftq-8+1]
+    psrld          m12, 24
+    vinserti128     m6, [dstq +strideq*0-1], 1
+    vinserti128     m9, [dstq +strideq*1-1], 1
+    vinserti128     m5, [dst4q+strideq*0+1], 1
+    vinserti128    m10, [dst4q+strideq*1+1], 1
+    punpckldq       m6, m9
+    pxor            m9, m9
+    vpblendd       m12, m9, 0x0F
+    punpckldq       m5, m10
+    vpblendvb       m6, m11, m12
+ %else
+    movd           xm6, [topq +strideq*0-1]
+    movq           xm5, [dstq +strideq*2-1]
+    movq           xm9, [dst4q+strideq*0-1]
+    movd          xm10, [dst4q+strideq*2+1]
+    pinsrd         xm6, [topq +strideq*1-1], 1
+    movhps         xm5, [dstq +stride3q -1]
+    movhps         xm9, [dst4q+strideq*1-1]
+    pinsrd        xm10, [dst4q+stride3q +1], 1
+    pinsrd         xm6, [dstq +strideq*0-1], 2
+    pinsrd        xm10, [dst8q+strideq*0+1], 2
+    pinsrd         xm6, [dstq +strideq*1-1], 3
+    pinsrd        xm10, [dst8q+strideq*1+1], 3
+    shufps        xm11, xm5, xm9, q2020
+    vinserti128     m6, xm11, 1
+    pmovzxbw       m11, [leftq-3]
+    psrldq         xm5, 2
+    psrldq         xm9, 2
+    shufps         xm5, xm9, q2020
+    movu            m9, [blend_4x8_1]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m11, m9
+ %endif
+%else
+    lea            r13, [blend_8x8_0]
+    movu           m11, [r13+hq*2*2+16*2]
+    movq           xm6, [top1q          -1]
+    movq           xm9, [top2q          -1]
+    movq           xm5, [dstq +strideq*2+1]
+    movq          xm10, [dstq +stride3q +1]
+    vinserti128     m6, [dstq +strideq*0-1], 1
+    vinserti128     m9, [dstq +strideq*1-1], 1
+    vinserti128     m5, [dst4q+strideq*0+1], 1
+    vinserti128    m10, [dst4q+strideq*1+1], 1
+    punpcklqdq      m6, m9
+    punpcklqdq      m5, m10
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*2], m11
+%endif
+    ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq +strideq*0]
+    movd           xm9, [topq +strideq*1]
+    movd           xm5, [dstq +strideq*2]
+    movd          xm10, [dstq +stride3q ]
+    vinserti128     m6, [dstq +strideq*0], 1
+    vinserti128     m9, [dstq +strideq*1], 1
+    vinserti128     m5, [dst4q+strideq*0], 1
+    vinserti128    m10, [dst4q+strideq*1], 1
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+ %else
+    movd           xm5, [dstq +strideq*2]
+    movd           xm6, [topq +strideq*0]
+    movd           xm9, [dst4q+strideq*2]
+    pinsrd         xm5, [dstq +stride3q ], 1
+    pinsrd         xm6, [topq +strideq*1], 1
+    pinsrd         xm9, [dst4q+stride3q ], 1
+    pinsrd         xm5, [dst4q+strideq*0], 2
+    pinsrd         xm6, [dstq +strideq*0], 2
+    pinsrd         xm9, [dst8q+strideq*0], 2
+    pinsrd         xm5, [dst4q+strideq*1], 3
+    pinsrd         xm6, [dstq +strideq*1], 3
+    pinsrd         xm9, [dst8q+strideq*1], 3
+    vinserti128     m6, xm5, 1
+    vinserti128     m5, xm9, 1
+ %endif
+%else
+    movq           xm5, [dstq +strideq*2]
+    movq           xm9, [dst4q+strideq*0]
+    movq           xm6, [top1q          ]
+    movq          xm10, [dstq +strideq*0]
+    movhps         xm5, [dstq +stride3q ]
+    movhps         xm9, [dst4q+strideq*1]
+    movhps         xm6, [top2q          ]
+    movhps        xm10, [dstq +strideq*1]
+    vinserti128     m5, xm9, 1
+    vinserti128     m6, xm10, 1
+%endif
+    ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm5, [dstq +strideq*2-1]
+    movd           xm9, [dstq +stride3q -1]
+    movd           xm6, [topq +strideq*0+1]
+    movd          xm10, [topq +strideq*1+1]
+    pinsrb         xm5, [leftq+ 5], 0
+    pinsrb         xm9, [leftq+ 7], 0
+    vinserti128     m6, [dstq +strideq*0+1], 1
+    vinserti128    m10, [dstq +strideq*1+1], 1
+    vinserti128     m5, [dst4q+strideq*0-1], 1
+    vinserti128     m9, [dst4q+strideq*1-1], 1
+    punpckldq       m6, m10
+    punpckldq       m5, m9
+ %else
+    movd           xm6, [topq +strideq*0+1]
+    movq           xm9, [dstq +strideq*2-1]
+    movq          xm10, [dst4q+strideq*0-1]
+    movd          xm11, [dst4q+strideq*2-1]
+    pinsrd         xm6, [topq +strideq*1+1], 1
+    movhps         xm9, [dstq +stride3q -1]
+    movhps        xm10, [dst4q+strideq*1-1]
+    pinsrd        xm11, [dst4q+stride3q -1], 1
+    pinsrd         xm6, [dstq +strideq*0+1], 2
+    pinsrd        xm11, [dst8q+strideq*0-1], 2
+    pinsrd         xm6, [dstq +strideq*1+1], 3
+    pinsrd        xm11, [dst8q+strideq*1-1], 3
+    shufps         xm5, xm9, xm10, q2020
+    vinserti128     m5, xm11, 1
+    pmovzxbw       m11, [leftq+5]
+    psrldq         xm9, 2
+    psrldq        xm10, 2
+    shufps         xm9, xm10, q2020
+    movu           m10, [blend_4x8_1+8]
+    vinserti128     m6, xm9, 1
+    vpblendvb       m5, m11, m10
+ %endif
+%else
+    lea            r13, [blend_8x8_0+16]
+    movq           xm5, [dstq +strideq*2-1]
+    movq           xm9, [dst4q+strideq*0-1]
+    movq           xm6, [top1q          +1]
+    movq          xm10, [dstq +strideq*0+1]
+    movhps         xm5, [dstq +stride3q -1]
+    movhps         xm9, [dst4q+strideq*1-1]
+    movhps         xm6, [top2q          +1]
+    movhps        xm10, [dstq +strideq*1+1]
+    movu           m11, [r13+hq*2*2+16*2]
+    vinserti128     m5, xm9, 1
+    vinserti128     m6, xm10, 1
+    vpblendvb       m5, [rsp+gprsize+80+hq*8+64+8*2], m11
+%endif
+    ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%if %1 == 4 && %2 == 8
+    PUSH            r9
+ %assign regs_used 10
+%else
+ %assign regs_used 9
+%endif
+%if STACK_ALIGNMENT < 32
+    PUSH  r%+regs_used
+ %assign regs_used regs_used+1
+%endif
+    ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
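+    ; px points into a padded word buffer with a 32-byte (16-word) row pitch
+    ; and 2 guard rows above and below the block; unavailable neighbour
+    ; pixels are filled with the 0x8000 sentinel prepared below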
+
+    pcmpeqw        m14, m14
+    psllw          m14, 15                  ; 0x8000
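+    ; 0x8000 marks missing pixels: its huge absolute difference drives
+    ; max(0, strength - (abs(diff) >> shift)) to zero, so padded positions
+    ; add nothing to the filter sum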
+
+    ; prepare pixel buffers - body/right
+%if %1 == 4
+    INIT_XMM avx2
+%endif
+%if %2 == 8
+    lea          dst4q, [dstq+strideq*4]
+%endif
+    lea       stride3q, [strideq*3]
+    test         edgeb, 2                   ; have_right
+    jz .no_right
+    pmovzxbw        m1, [dstq+strideq*0]
+    pmovzxbw        m2, [dstq+strideq*1]
+    pmovzxbw        m3, [dstq+strideq*2]
+    pmovzxbw        m4, [dstq+stride3q]
+    mova     [px+0*32], m1
+    mova     [px+1*32], m2
+    mova     [px+2*32], m3
+    mova     [px+3*32], m4
+%if %2 == 8
+    pmovzxbw        m1, [dst4q+strideq*0]
+    pmovzxbw        m2, [dst4q+strideq*1]
+    pmovzxbw        m3, [dst4q+strideq*2]
+    pmovzxbw        m4, [dst4q+stride3q]
+    mova     [px+4*32], m1
+    mova     [px+5*32], m2
+    mova     [px+6*32], m3
+    mova     [px+7*32], m4
+%endif
+    jmp .body_done
+.no_right:
+%if %1 == 4
+    movd           xm1, [dstq+strideq*0]
+    movd           xm2, [dstq+strideq*1]
+    movd           xm3, [dstq+strideq*2]
+    movd           xm4, [dstq+stride3q]
+    pmovzxbw       xm1, xm1
+    pmovzxbw       xm2, xm2
+    pmovzxbw       xm3, xm3
+    pmovzxbw       xm4, xm4
+    movq     [px+0*32], xm1
+    movq     [px+1*32], xm2
+    movq     [px+2*32], xm3
+    movq     [px+3*32], xm4
+%else
+    pmovzxbw       xm1, [dstq+strideq*0]
+    pmovzxbw       xm2, [dstq+strideq*1]
+    pmovzxbw       xm3, [dstq+strideq*2]
+    pmovzxbw       xm4, [dstq+stride3q]
+    mova     [px+0*32], xm1
+    mova     [px+1*32], xm2
+    mova     [px+2*32], xm3
+    mova     [px+3*32], xm4
+%endif
+    movd [px+0*32+%1*2], xm14
+    movd [px+1*32+%1*2], xm14
+    movd [px+2*32+%1*2], xm14
+    movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+    movd           xm1, [dst4q+strideq*0]
+    movd           xm2, [dst4q+strideq*1]
+    movd           xm3, [dst4q+strideq*2]
+    movd           xm4, [dst4q+stride3q]
+    pmovzxbw       xm1, xm1
+    pmovzxbw       xm2, xm2
+    pmovzxbw       xm3, xm3
+    pmovzxbw       xm4, xm4
+    movq     [px+4*32], xm1
+    movq     [px+5*32], xm2
+    movq     [px+6*32], xm3
+    movq     [px+7*32], xm4
+ %else
+    pmovzxbw       xm1, [dst4q+strideq*0]
+    pmovzxbw       xm2, [dst4q+strideq*1]
+    pmovzxbw       xm3, [dst4q+strideq*2]
+    pmovzxbw       xm4, [dst4q+stride3q]
+    mova     [px+4*32], xm1
+    mova     [px+5*32], xm2
+    mova     [px+6*32], xm3
+    mova     [px+7*32], xm4
+ %endif
+    movd [px+4*32+%1*2], xm14
+    movd [px+5*32+%1*2], xm14
+    movd [px+6*32+%1*2], xm14
+    movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+    ; top
+    test         edgeb, 4                    ; have_top
+    jz .no_top
+    test         edgeb, 1                    ; have_left
+    jz .top_no_left
+    test         edgeb, 2                    ; have_right
+    jz .top_no_right
+    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
+    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
+    movu  [px-2*32-%1], m1
+    movu  [px-1*32-%1], m2
+    jmp .top_done
+.top_no_right:
+    pmovzxbw        m1, [topq+strideq*0-%1]
+    pmovzxbw        m2, [topq+strideq*1-%1]
+    movu [px-2*32-%1*2], m1
+    movu [px-1*32-%1*2], m2
+    movd [px-2*32+%1*2], xm14
+    movd [px-1*32+%1*2], xm14
+    jmp .top_done
+.top_no_left:
+    test         edgeb, 2                   ; have_right
+    jz .top_no_left_right
+    pmovzxbw        m1, [topq+strideq*0]
+    pmovzxbw        m2, [topq+strideq*1]
+    mova   [px-2*32+0], m1
+    mova   [px-1*32+0], m2
+    movd   [px-2*32-4], xm14
+    movd   [px-1*32-4], xm14
+    jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+    movd           xm1, [topq+strideq*0]
+    pinsrd         xm1, [topq+strideq*1], 1
+    pmovzxbw       xm1, xm1
+    movq   [px-2*32+0], xm1
+    movhps [px-1*32+0], xm1
+%else
+    pmovzxbw       xm1, [topq+strideq*0]
+    pmovzxbw       xm2, [topq+strideq*1]
+    mova   [px-2*32+0], xm1
+    mova   [px-1*32+0], xm2
+%endif
+    movd   [px-2*32-4], xm14
+    movd   [px-1*32-4], xm14
+    movd [px-2*32+%1*2], xm14
+    movd [px-1*32+%1*2], xm14
+    jmp .top_done
+.no_top:
+    movu   [px-2*32-%1], m14
+    movu   [px-1*32-%1], m14
+.top_done:
+
+    ; left
+    test         edgeb, 1                   ; have_left
+    jz .no_left
+    pmovzxbw       xm1, [leftq+ 0]
+%if %2 == 8
+    pmovzxbw       xm2, [leftq+ 8]
+%endif
+    movd   [px+0*32-4], xm1
+    pextrd [px+1*32-4], xm1, 1
+    pextrd [px+2*32-4], xm1, 2
+    pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+    movd   [px+4*32-4], xm2
+    pextrd [px+5*32-4], xm2, 1
+    pextrd [px+6*32-4], xm2, 2
+    pextrd [px+7*32-4], xm2, 3
+%endif
+    jmp .left_done
+.no_left:
+    movd   [px+0*32-4], xm14
+    movd   [px+1*32-4], xm14
+    movd   [px+2*32-4], xm14
+    movd   [px+3*32-4], xm14
+%if %2 == 8
+    movd   [px+4*32-4], xm14
+    movd   [px+5*32-4], xm14
+    movd   [px+6*32-4], xm14
+    movd   [px+7*32-4], xm14
+%endif
+.left_done:
+
+    ; bottom
+    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
+    test         edgeb, 8                   ; have_bottom
+    jz .no_bottom
+    lea          dst8q, [dstq+%2*strideq]
+    test         edgeb, 1                   ; have_left
+    jz .bottom_no_left
+    test         edgeb, 2                   ; have_right
+    jz .bottom_no_right
+    pmovzxbw        m1, [dst8q-(%1/2)]
+    pmovzxbw        m2, [dst8q+strideq-(%1/2)]
+    movu   [px+(%2+0)*32-%1], m1
+    movu   [px+(%2+1)*32-%1], m2
+    jmp .bottom_done
+.bottom_no_right:
+    pmovzxbw        m1, [dst8q-%1]
+    pmovzxbw        m2, [dst8q+strideq-%1]
+    movu  [px+(%2+0)*32-%1*2], m1
+    movu  [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
+%endif
+    movd  [px+(%2+0)*32+%1*2], xm14
+    movd  [px+(%2+1)*32+%1*2], xm14
+    jmp .bottom_done
+.bottom_no_left:
+    test          edgeb, 2                  ; have_right
+    jz .bottom_no_left_right
+    pmovzxbw        m1, [dst8q]
+    pmovzxbw        m2, [dst8q+strideq]
+    mova   [px+(%2+0)*32+0], m1
+    mova   [px+(%2+1)*32+0], m2
+    movd   [px+(%2+0)*32-4], xm14
+    movd   [px+(%2+1)*32-4], xm14
+    jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+    movd           xm1, [dst8q]
+    pinsrd         xm1, [dst8q+strideq], 1
+    pmovzxbw       xm1, xm1
+    movq   [px+(%2+0)*32+0], xm1
+    movhps [px+(%2+1)*32+0], xm1
+%else
+    pmovzxbw       xm1, [dst8q]
+    pmovzxbw       xm2, [dst8q+strideq]
+    mova   [px+(%2+0)*32+0], xm1
+    mova   [px+(%2+1)*32+0], xm2
+%endif
+    movd   [px+(%2+0)*32-4], xm14
+    movd   [px+(%2+1)*32-4], xm14
+    movd  [px+(%2+0)*32+%1*2], xm14
+    movd  [px+(%2+1)*32+%1*2], xm14
+    jmp .bottom_done
+.no_bottom:
+    movu   [px+(%2+0)*32-%1], m14
+    movu   [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+    ; actual filter
+    INIT_YMM avx2
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+    ; register to shuffle values into after packing
+    vbroadcasti128 m12, [shufb_lohi]
+
+    mov       dampingd, r7m
+    xor          zerod, zerod
+    movifnidn     prid, prim
+    sub       dampingd, 31
+    movifnidn  secdmpd, secdmpm
+    test          prid, prid
+    jz .border_sec_only
+    movd           xm0, prid
+    lzcnt      pridmpd, prid
+    add        pridmpd, dampingd
+    cmovs      pridmpd, zerod
+    mov        [rsp+0], pridmpq                 ; pri_shift
+    test       secdmpd, secdmpd
+    jz .border_pri_only
+    movd           xm1, secdmpd
+    lzcnt      secdmpd, secdmpd
+    add        secdmpd, dampingd
+    cmovs      secdmpd, zerod
+    mov        [rsp+8], secdmpq                 ; sec_shift
+
+    DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+
+    ; pri/sec_taps[k] [4 total]
+    DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    vpbroadcastb    m1, xm1                     ; sec_strength
+    and           prid, 1
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    lea           secq, [tableq+12]             ; sec_taps
+
+    BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+    BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+    vpbroadcastb    m2, [priq+kq]               ; pri_taps
+    vpbroadcastb    m3, [secq+kq]               ; sec_taps
+    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+    dec             kq
+    jge .border_k_loop
+
+    vpbroadcastd   m10, [pw_2048]
+    BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+    lea           dstq, [dstq+strideq*vloop_lines]
+    add           stkq, 32*vloop_lines
+    dec             hd
+    jg .border_v_loop
+%endif
+    RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    and           prid, 1
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    BORDER_PREP_REGS %1, %2
+    vpbroadcastd    m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+    BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+    vpbroadcastb    m2, [priq+kq]               ; pri_taps
+    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+    dec             kq
+    jge .border_pri_k_loop
+    BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+    lea           dstq, [dstq+strideq*vloop_lines]
+    add           stkq, 32*vloop_lines
+    dec             hd
+    jg .border_pri_v_loop
+%endif
+    RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
+    movd           xm1, secdmpd
+    lzcnt      secdmpd, secdmpd
+    add        secdmpd, dampingd
+    cmovs      secdmpd, zerod
+    mov        [rsp+8], secdmpq                 ; sec_shift
+ DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
+    lea         tableq, [tap_table]
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
+    vpbroadcastb    m1, xm1                     ; sec_strength
+    lea           secq, [tableq+12]             ; sec_taps
+    BORDER_PREP_REGS %1, %2
+    vpbroadcastd    m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+    BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+    vpbroadcastb    m3, [secq+kq]               ; sec_taps
+    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+    dec             kq
+    jge .border_sec_k_loop
+    BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+    lea           dstq, [dstq+strideq*vloop_lines]
+    add           stkq, 32*vloop_lines
+    dec             hd
+    jg .border_sec_v_loop
+%endif
+    RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
+INIT_YMM avx2
+cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
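+    ; Direction search: load the 8x8 block, subtract the bias of 128, build
+    ; the horizontal/vertical, diagonal and "alt" partial sums, square and
+    ; scale them by div_table to get the 8 direction costs, then pick the
+    ; direction with the largest cost. The reported variance is
+    ; (best_cost - cost[best_dir ^ 4]) >> 10.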
+    lea       stride3q, [strideq*3]
+    movq           xm0, [srcq+strideq*0]
+    movq           xm1, [srcq+strideq*1]
+    movq           xm2, [srcq+strideq*2]
+    movq           xm3, [srcq+stride3q]
+    lea           srcq, [srcq+strideq*4]
+    vpbroadcastq    m4, [srcq+strideq*0]
+    vpbroadcastq    m5, [srcq+strideq*1]
+    vpbroadcastq    m6, [srcq+strideq*2]
+    vpbroadcastq    m7, [srcq+stride3q]
+    vpbroadcastd    m8, [pw_128]
+    pxor            m9, m9
+
+    vpblendd        m0, m0, m7, 0xf0
+    vpblendd        m1, m1, m6, 0xf0
+    vpblendd        m2, m2, m5, 0xf0
+    vpblendd        m3, m3, m4, 0xf0
+
+    punpcklbw       m0, m9
+    punpcklbw       m1, m9
+    punpcklbw       m2, m9
+    punpcklbw       m3, m9
+
+    psubw           m0, m8
+    psubw           m1, m8
+    psubw           m2, m8
+    psubw           m3, m8
+
+    ; shuffle registers to generate partial_sum_diag[0-1] together
+    vpermq          m7, m0, q1032
+    vpermq          m6, m1, q1032
+    vpermq          m5, m2, q1032
+    vpermq          m4, m3, q1032
+
+    ; start with partial_sum_hv[0-1]
+    paddw           m8, m0, m1
+    paddw           m9, m2, m3
+    phaddw         m10, m0, m1
+    phaddw         m11, m2, m3
+    paddw           m8, m9
+    phaddw         m10, m11
+    vextracti128   xm9, m8, 1
+    vextracti128  xm11, m10, 1
+    paddw          xm8, xm9                 ; partial_sum_hv[1]
+    phaddw        xm10, xm11                ; partial_sum_hv[0]
+    vinserti128     m8, xm10, 1
+    vpbroadcastd    m9, [div_table+44]
+    pmaddwd         m8, m8
+    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]
+
+    ; create aggregates [lower half]:
+    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+    ; and [upper half]:
+    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+    pslldq          m9, m1, 2
+    psrldq         m10, m1, 14
+    pslldq         m11, m2, 4
+    psrldq         m12, m2, 12
+    pslldq         m13, m3, 6
+    psrldq         m14, m3, 10
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m4, 8
+    psrldq         m12, m4, 8
+    pslldq         m13, m5, 10
+    psrldq         m14, m5, 6
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m6, 12
+    psrldq         m12, m6, 4
+    pslldq         m13, m7, 14
+    psrldq         m14, m7, 2
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
+    vbroadcasti128 m14, [shufw_6543210x]
+    vbroadcasti128 m13, [div_table+16]
+    vbroadcasti128 m12, [div_table+0]
+    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
+    pshufb         m10, m14
+    punpckhwd      m11, m9, m10
+    punpcklwd       m9, m10
+    pmaddwd        m11, m11
+    pmaddwd         m9, m9
+    pmulld         m11, m13
+    pmulld          m9, m12
+    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]
+
+    ; merge horizontally and vertically for partial_sum_alt[0-3]
+    paddw          m10, m0, m1
+    paddw          m11, m2, m3
+    paddw          m12, m4, m5
+    paddw          m13, m6, m7
+    phaddw          m0, m4
+    phaddw          m1, m5
+    phaddw          m2, m6
+    phaddw          m3, m7
+
+    ; create aggregates [lower half]:
+    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+    ; and [upper half]:
+    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+    pslldq          m4, m11, 2
+    psrldq         m11, 14
+    pslldq          m5, m12, 4
+    psrldq         m12, 12
+    pslldq          m6, m13, 6
+    psrldq         m13, 10
+    paddw           m4, m10
+    paddw          m11, m12
+    vpbroadcastd   m12, [div_table+44]
+    paddw           m5, m6
+    paddw          m11, m13                 ; partial_sum_alt[3/2] right
+    vbroadcasti128 m13, [div_table+32]
+    paddw           m4, m5                  ; partial_sum_alt[3/2] left
+    pshuflw         m5, m11, q3012
+    punpckhwd       m6, m11, m4
+    punpcklwd       m4, m5
+    pmaddwd         m6, m6
+    pmaddwd         m4, m4
+    pmulld          m6, m12
+    pmulld          m4, m13
+    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]
+
+    ; create aggregates [lower half]:
+    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+    ; and [upper half]:
+    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+    pslldq          m5, m1, 2
+    psrldq          m1, 14
+    pslldq          m6, m2, 4
+    psrldq          m2, 12
+    pslldq          m7, m3, 6
+    psrldq          m3, 10
+    paddw           m5, m0
+    paddw           m1, m2
+    paddw           m6, m7
+    paddw           m1, m3                  ; partial_sum_alt[0/1] right
+    paddw           m5, m6                  ; partial_sum_alt[0/1] left
+    pshuflw         m0, m1, q3012
+    punpckhwd       m1, m5
+    punpcklwd       m5, m0
+    pmaddwd         m1, m1
+    pmaddwd         m5, m5
+    pmulld          m1, m12
+    pmulld          m5, m13
+    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]
+
+    mova           xm0, [pd_47130256+ 16]
+    mova            m1, [pd_47130256]
+    phaddd          m9, m8
+    phaddd          m5, m4
+    phaddd          m9, m5
+    vpermd          m0, m9                  ; cost[0-3]
+    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]
+
+    ; now find the best cost
+    pmaxsd         xm2, xm0, xm1
+    pshufd         xm3, xm2, q1032
+    pmaxsd         xm2, xm3
+    pshufd         xm3, xm2, q2301
+    pmaxsd         xm2, xm3 ; best cost
+
+    ; find the idx using minpos:
+    ; subtract the best cost from all costs, making the best one 0 and all
+    ; others negative, i.e. large when viewed as unsigned 16-bit values, so
+    ; phminposuw returns the position of the best cost
+    psubd          xm4, xm1, xm2
+    psubd          xm3, xm0, xm2
+    packssdw       xm3, xm4
+    phminposuw     xm3, xm3
+
+    ; convert idx to 32-bits
+    psrld          xm3, 16
+    movd           eax, xm3
+
+    ; get idx^4 complement
+    vpermd          m3, m1
+    psubd          xm2, xm3
+    psrld          xm2, 10
+    movd        [varq], xm2
+    RET
+
+%endif ; ARCH_X86_64
diff --git a/src/x86/cdef_avx512.asm b/src/x86/cdef_avx512.asm
new file mode 100644
index 0000000..e7eee9e
--- /dev/null
+++ b/src/x86/cdef_avx512.asm
@@ -0,0 +1,867 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if HAVE_AVX512ICL && ARCH_X86_64
+
+%macro DUP4 1-*
+    %rep %0
+        times 4 db %1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
+        ; masking away unused bits allows us to use a single vpaddd {1to16}
+        ; instruction instead of having to do vpbroadcastd + paddb
+        db %13 & 0x3f, -%13 & 0x3f
+        %rotate 1
+    %endrep
+%endmacro
+
+SECTION_RODATA 64
+
+lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
+               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+              db  96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+              db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+              db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
+lut_perm_8x8a: db  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+               db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
+               db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
+               db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
+lut_perm_8x8b: db  4,  5,  6,  7,  8,  9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
+               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+               db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
+              db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
+edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
+px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
+gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
+      times 16 db  0 ; realign (introduced by cdef_dirs)
+end_perm_w8clip:db 0, 4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
+sec_tap:       db 32, 32, 16, 16
+pd_268435568:  dd 268435568
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 5, 6
+%else
+DECLARE_REG_TMP 8, 5
+%endif
+
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; 4e 4f 40 41 42 43 44 45
+; 5e 5f 50 51 52 53 54 55
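+;
+; Strategy: the block and its top/left context are first gathered into one
+; zmm register with the layout above (the "lut"); every directional tap is
+; then fetched with a single vpermb of that register, and out-of-range taps
+; are masked via vpshufbitqmb against edge_mask when edges are missing.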
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
+%define base r7-edge_mask
+    movq         xmm0, [dstq+strideq*0]
+    movhps       xmm0, [dstq+strideq*1]
+    lea            r7, [edge_mask]
+    movq         xmm1, [topq+strideq*0-2]
+    movhps       xmm1, [topq+strideq*1-2]
+    mov           r6d, edgem
+    vinserti32x4  ym0, ymm0, [leftq], 1
+    lea            r2, [strideq*3]
+    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
+    mova           m5, [base+lut_perm_4x4]
+    vinserti32x4   m0, [dstq+r2], 2
+    test          r6b, 0x08      ; avoid buffer overread
+    jz .main
+    lea            r3, [dstq+strideq*4-4]
+    vinserti32x4   m1, [r3+strideq*0], 2
+    vinserti32x4   m0, [r3+strideq*1], 3
+.main:
+    movifnidn    prid, prim
+    mov           t0d, dirm
+    mova           m3, [base+px_idx]
+    mov           r3d, dampingm
+    vpermi2b       m5, m0, m1    ; lut
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor           m7, m7
+    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+    vpermb         m6, m3, m5    ; px
+    cmp           r6d, 0x0f
+    jne .mask_edges              ; mask edges only if required
+    test         prid, prid
+    jz .sec_only
+    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
+%macro CDEF_FILTER_4x4_PRI 0
+    vpcmpub        k1, m6, m1, 6 ; px > pN
+    psubb          m2, m1, m6
+    lzcnt         r6d, prid
+    vpsubb     m2{k1}, m6, m1    ; abs(diff)
+    vpbroadcastb   m4, prid
+    and          prid, 1
+    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
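+    ; (gf2p8affineqb with a matrix from gf_shr performs a per-byte variable
+    ;  right shift; SSE/AVX have no 8-bit shift instruction)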
+    movifnidn     t1d, secm
+    vpbroadcastd  m10, [base+pri_tap+priq*4]
+    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
+    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift))
+    pminub         m2, m4
+    vpdpbusd       m0, m2, m10   ; sum
+%endmacro
+    CDEF_FILTER_4x4_PRI
+    test          t1d, t1d       ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
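+    ; clip the filtered result to the [min, max] range spanned by the source
+    ; pixel and every tap that was sampled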
+    pminub         m4, m6, m1
+    pmaxub         m1, m6
+    pminub         m5, m2, m3
+    pmaxub         m2, m3
+    pminub         m4, m5
+    pmaxub         m2, m1
+    psrldq         m1, m4, 2
+    psrldq         m3, m2, 2
+    pminub         m1, m4
+    vpcmpw         k1, m0, m7, 1
+    vpshldd        m6, m0, 8
+    pmaxub         m2, m3
+    pslldq         m3, m1, 1
+    psubw          m7, m0
+    paddusw        m0, m6     ; clip >0xff
+    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
+    pslldq         m4, m2, 1
+    pminub         m1, m3
+    pmaxub         m2, m4
+    pmaxub         m0, m1
+    pminub         m0, m2
+    jmp .end
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+    mova          xm1, [base+end_perm]
+    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    RET
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    vpbroadcastq   m8, [base+edge_mask+r6*8]
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
+    vpshufbitqmb   k1, m8, m2 ; index in-range
+    mova           m1, m6
+    vpermb     m1{k1}, m2, m5
+    CDEF_FILTER_4x4_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
+    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
+    vpshufbitqmb   k1, m8, m4
+    mova           m2, m6
+    vpermb     m2{k1}, m4, m5
+    vpshufbitqmb   k1, m8, m9
+    mova           m3, m6
+    vpermb     m3{k1}, m9, m5
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+    vpbroadcastd   m8, [base+sec_tap]
+    vpcmpub        k1, m6, m2, 6
+    psubb          m4, m2, m6
+    vpbroadcastb  m12, t1d
+    lzcnt         t1d, t1d
+    vpsubb     m4{k1}, m6, m2
+    vpcmpub        k2, m6, m3, 6
+    vpbroadcastq  m11, [r3+t1*8]
+    gf2p8affineqb m10, m4, m11, 0
+    psubb          m5, m3, m6
+    mova           m9, m8
+    vpsubb     m8{k1}, m7, m8
+    psubusb       m10, m12, m10
+    vpsubb     m5{k2}, m6, m3
+    pminub         m4, m10
+    vpdpbusd       m0, m4, m8
+    gf2p8affineqb m11, m5, m11, 0
+    vpsubb     m9{k2}, m7, m9
+    psubusb       m12, m11
+    pminub         m5, m12
+    vpdpbusd       m0, m5, m9
+    ret
+
+DECLARE_REG_TMP 2, 7
+
+;         lut top                lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45  8e 8f 80 81 82 83 84 85
+; La Lb 50 51 52 53 54 55  9e 9f 90 91 92 93 94 95
+
+cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
+                                   pri, sec, dir, damping, edge
+%define base r8-edge_mask
+    vpbroadcastd ym21, strided
+    mov           r6d, edgem
+    lea            r8, [edge_mask]
+    movq          xm1, [topq+strideq*0-2]
+    pmulld       ym21, [base+pd_01234567]
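+    ; ym21 = stride * {0..7}: per-row offsets for the qword gather below and
+    ; the dword scatter at the end, addressing all 8 rows in one instruction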
+    kxnorb         k1, k1, k1
+    movq          xm2, [topq+strideq*1-2]
+    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
+    mova          m14, [base+lut_perm_4x8a]
+    movu          m15, [base+lut_perm_4x8b]
+    test          r6b, 0x08         ; avoid buffer overread
+    jz .main
+    lea            r7, [dstq+strideq*8-2]
+    vinserti32x4  ym1, [r7+strideq*0], 1
+    vinserti32x4  ym2, [r7+strideq*1], 1
+.main:
+    punpcklqdq    ym1, ym2
+    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
+    movifnidn    prid, prim
+    mov           t0d, dirm
+    mova          m16, [base+px_idx]
+    mov           r3d, dampingm
+    vpermi2b      m14, m0, m1    ; lut top
+    vpermi2b      m15, m0, m1    ; lut bottom
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor          m20, m20
+    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+    vpermb         m2, m16, m14  ; pxt
+    vpermb         m3, m16, m15  ; pxb
+    mova           m1, m0
+    cmp           r6b, 0x0f
+    jne .mask_edges              ; mask edges only if required
+    test         prid, prid
+    jz .sec_only
+    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
+    vpermb         m5, m6, m15   ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
+    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
+    psubb          m6, m4, m2
+    psubb          m7, m5, m3
+    lzcnt         r6d, prid
+    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
+    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
+    vpbroadcastb  m13, prid
+    vpbroadcastq   m9, [r3+r6*8]
+    and          prid, 1
+    vpbroadcastd  m11, [base+pri_tap+priq*4]
+    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+    mova          m10, m11
+    movifnidn     t1d, secm
+    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
+    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
+    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift))
+    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift))
+    pminub         m6, m12
+    pminub         m7, m13
+    vpdpbusd       m0, m6, m10   ; sum top
+    vpdpbusd       m1, m7, m11   ; sum bottom
+%endmacro
+    CDEF_FILTER_4x8_PRI
+    test          t1d, t1d       ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
+    pminub        m10, m4, m2
+    pminub        m12, m6, m8
+    pminub        m11, m5, m3
+    pminub        m13, m7, m9
+    pmaxub         m4, m2
+    pmaxub         m6, m8
+    pmaxub         m5, m3
+    pmaxub         m7, m9
+    pminub        m10, m12
+    pminub        m11, m13
+    pmaxub         m4, m6
+    pmaxub         m5, m7
+    mov           r2d, 0xAAAAAAAA
+    kmovd          k1, r2d
+    kxnorb         k2, k2, k2       ;   hw   lw
+    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
+    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
+    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
+    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
+    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
+    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
+    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
+    vpshrdd        m2, m3, 16
+    pminub         m6, m7
+    pmaxub         m8, m9
+    mova         ym14, [base+end_perm]
+    vpcmpw         k1, m4, m20, 1
+    vpshldw        m2, m5, 8
+    pslldq         m7, m6, 1
+    pslldq         m9, m8, 1
+    psubw          m5, m20, m4
+    paddusw        m0, m4, m2 ; clip >0xff
+    pminub         m6, m7
+    pmaxub         m8, m9
+    psubusw    m0{k1}, m2, m5 ; clip <0x00
+    pmaxub         m0, m6
+    pminub         m0, m8
+    vpermb         m0, m14, m0
+    vpscatterdd [dstq+ym21]{k2}, ym0
+    RET
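+; The clipped store above corresponds to the CDEF reference behaviour
+;     dst = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
+; where min/max are running minima/maxima over px and every in-range tap,
+; gathered by the pminub/pmaxub tree before the word-level merge.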
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    mova          ym4, [base+end_perm]
+    kxnorb         k1, k1, k1
+    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    vpshldd        m3, m1, 8
+    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+    paddw          m1, m3
+    pslld          m0, 16
+    vpshrdd        m0, m1, 16
+    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
+    vpscatterdd [dstq+ym21]{k1}, ym0
+    RET
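+; No-clip epilogue: seeding the accumulators with (1 << 28) + (7 << 4)
+; lets the vpshldd/paddw pair above produce the rounded result
+; px + ((8 + sum - (sum < 0)) >> 4) directly in bits 8-15 of each lane,
+; with no separate sign test; vpermb then picks out those bytes.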
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    mov           t1d, r6d
+    or            r6d, 8 ; top 4x4 has bottom
+    or            t1d, 4 ; bottom 4x4 has top
+    vpbroadcastq  m17, [base+edge_mask+r6*8]
+    vpbroadcastq  m18, [base+edge_mask+t1*8]
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+    vpshufbitqmb   k1, m17, m6 ; index in-range
+    vpshufbitqmb   k2, m18, m6
+    mova           m4, m2
+    mova           m5, m3
+    vpermb     m4{k1}, m6, m14
+    vpermb     m5{k2}, m6, m15
+    CDEF_FILTER_4x8_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+    vpshufbitqmb   k1, m17, m10
+    vpshufbitqmb   k2, m18, m10
+    vpshufbitqmb   k3, m17, m11
+    vpshufbitqmb   k4, m18, m11
+    mova           m6, m2
+    mova           m7, m3
+    mova           m8, m2
+    mova           m9, m3
+    vpermb     m6{k1}, m10, m14
+    vpermb     m7{k2}, m10, m15
+    vpermb     m8{k3}, m11, m14
+    vpermb     m9{k4}, m11, m15
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+    vpermb         m7, m8, m15 ; pNb
+    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+    vpermb         m9, m9, m15 ; pNb
+.sec_main:
+    vpbroadcastb  m18, t1d
+    lzcnt         t1d, t1d
+    vpcmpub        k1, m2, m6, 6
+    vpcmpub        k2, m3, m7, 6
+    vpcmpub        k3, m2, m8, 6
+    vpcmpub        k4, m3, m9, 6
+    vpbroadcastq  m17, [r3+t1*8]
+    psubb         m10, m6, m2
+    psubb         m11, m7, m3
+    psubb         m12, m8, m2
+    psubb         m13, m9, m3
+    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
+    vpsubb    m11{k2}, m3, m7      ; abs(db0)
+    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
+    vpsubb    m13{k4}, m3, m9      ; abs(db1)
+    vpbroadcastd  m19, [base+sec_tap]
+    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift))
+    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift))
+    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift))
+    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift))
+    pminub        m10, m14
+    pminub        m11, m15
+    pminub        m12, m16
+    pminub        m13, m17
+    mova          m14, m19
+    mova          m15, m19
+    mova          m16, m19
+    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
+    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
+    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
+    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
+    vpdpbusd       m0, m10, m14
+    vpdpbusd       m1, m11, m15
+    vpdpbusd       m0, m12, m16
+    vpdpbusd       m1, m13, m19
+    ret
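+; The 4x8 secondary pass mirrors the 4x4 one, but keeps the top and bottom
+; 4x4 halves (the *t/*b register pairs) in flight simultaneously, trading
+; register pressure for fewer passes over the block.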
+
+;         lut tl                   lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7  t6 t7 t8 t9 ta tb tc td
+; T0 T1 T2 T3 T4 T5 T6 T7  T6 T7 T8 T9 TA TB TC TD
+; L0 L1 00 01 02 03 04 05  04 05 06 07 08 09 0a 0b
+; L2 L3 10 11 12 13 14 15  14 15 16 17 18 19 1a 1b
+; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
+;         lut bl                   lut br
+; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
+; Lc Ld 60 61 62 63 64 65  64 65 66 67 68 69 6a 6b
+; Le Lf 70 71 72 73 74 75  74 75 76 77 78 79 7a 7b
+; 8e 8f 80 81 82 83 84 85  84 85 86 87 88 89 8a 8b
+; 9e 9f 90 91 92 93 94 95  94 95 96 97 98 99 9a 9b
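+; As suggested by the diagrams, the 8x8 block is split into four
+; overlapping 8x8-byte luts (tl/tr/bl/br), one zmm register each, every
+; quadrant carrying its own 2-pixel apron; the four partial sums are
+; accumulated independently in m0-m3 and merged in the epilogue.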
+
+cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
+                                          pri, sec, dir, damping, edge
+%define base r8-edge_mask
+    mov           r6d, edgem
+    lea           r10, [dstq+strideq*4-2]
+    movu         xmm0, [topq+strideq*0-2]
+    movu         xmm1, [dstq+strideq*2-2]
+    movu         xmm2, [r10 +strideq*2  ]
+    lea            r8, [edge_mask]
+    lea            r9, [strideq*3]
+    pmovzxwq      m10, [leftq-4]
+    vinserti32x4  ym0, ymm0, [topq+strideq*1-2], 1
+    vinserti32x4  ym1, ymm1, [dstq+r9       -2], 1
+    vinserti32x4  ym2, ymm2, [r10 +r9         ], 1
+    lea            r7, [r10 +strideq*4  ]
+    pmovzxwq      m11, [leftq+4]
+    vinserti32x4   m0, [dstq+strideq*0-2], 2
+    vinserti32x4   m1, [r10 +strideq*0  ], 2
+    mova          m12, [base+lut_perm_8x8a]
+    movu          m13, [base+lut_perm_8x8b]
+    vinserti32x4   m0, [dstq+strideq*1-2], 3
+    vinserti32x4   m1, [r10 +strideq*1  ], 3
+    test          r6b, 0x08       ; avoid buffer overread
+    jz .main
+    vinserti32x4   m2, [r7  +strideq*0], 2
+    vinserti32x4   m2, [r7  +strideq*1], 3
+.main:
+    mov           t1d, 0x11111100
+    mova          m14, m12
+    mova          m15, m13
+    kmovd          k1, t1d
+    kshiftrd       k2, k1, 8
+    movifnidn    prid, prim
+    mov           t0d, dirm
+    mova          m30, [base+px_idx]
+    mov           r3d, dampingm
+    vpermi2b      m12, m0, m1     ; lut tl
+    vpermi2b      m14, m1, m2     ; lut bl
+    vpermi2b      m13, m0, m1     ; lut tr
+    vpermi2b      m15, m1, m2     ; lut br
+    vpblendmw m12{k1}, m12, m10
+    vpblendmw m14{k2}, m14, m11
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor          m31, m31
+    lea            r3, [r8+r3*8]  ; gf_shr + (damping - 30) * 8
+    vpermb         m4, m30, m12   ; pxtl
+    vpermb         m5, m30, m13   ; pxtr
+    vpermb         m6, m30, m14   ; pxbl
+    vpermb         m7, m30, m15   ; pxbr
+    mova           m1, m0
+    mova           m2, m0
+    mova           m3, m0
+    cmp           r6b, 0x0f
+    jne .mask_edges               ; mask edges only if required
+    test         prid, prid
+    jz .sec_only
+    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m8, m11, m12   ; pNtl k0p0 k0p1 k1p0 k1p1
+    vpermb         m9, m11, m13   ; pNtr
+    vpermb        m10, m11, m14   ; pNbl
+    vpermb        m11, m11, m15   ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+    vpcmpub        k1, m4, m8, 6  ; pxtl > pNtl
+    vpcmpub        k2, m5, m9, 6  ; pxtr > pNtr
+    vpcmpub        k3, m6, m10, 6 ; pxbl > pNbl
+    vpcmpub        k4, m7, m11, 6 ; pxbr > pNbr
+    psubb         m16, m8, m4
+    psubb         m17, m9, m5
+    psubb         m18, m10, m6
+    psubb         m19, m11, m7
+    lzcnt         r6d, prid
+    vpsubb    m16{k1}, m4, m8     ; abs(diff_tl)
+    vpsubb    m17{k2}, m5, m9     ; abs(diff_tr)
+    vpsubb    m18{k3}, m6, m10    ; abs(diff_bl)
+    vpsubb    m19{k4}, m7, m11    ; abs(diff_br)
+    vpbroadcastq  m28, [r3+r6*8]
+    vpbroadcastb  m29, prid
+    and          prid, 1
+    vpbroadcastd  m27, [base+pri_tap+priq*4]
+    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+    mova          m24, m27
+    mova          m25, m27
+    mova          m26, m27
+    movifnidn     t1d, secm
+    vpsubb    m24{k1}, m31, m27   ; apply_sign(pri_tap_tl)
+    vpsubb    m25{k2}, m31, m27   ; apply_sign(pri_tap_tr)
+    vpsubb    m26{k3}, m31, m27   ; apply_sign(pri_tap_bl)
+    vpsubb    m27{k4}, m31, m27   ; apply_sign(pri_tap_br)
+    psubusb       m20, m29, m20   ; imax(0, pri_strength - (abs(dtl) >> shift))
+    psubusb       m21, m29, m21   ; imax(0, pri_strength - (abs(dtr) >> shift))
+    psubusb       m22, m29, m22   ; imax(0, pri_strength - (abs(dbl) >> shift))
+    psubusb       m23, m29, m23   ; imax(0, pri_strength - (abs(dbr) >> shift))
+    pminub        m16, m20
+    pminub        m17, m21
+    pminub        m18, m22
+    pminub        m19, m23
+    vpdpbusd       m0, m16, m24   ; sum tl
+    vpdpbusd       m1, m17, m25   ; sum tr
+    vpdpbusd       m2, m18, m26   ; sum bl
+    vpdpbusd       m3, m19, m27   ; sum br
+%endmacro
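+; CDEF_FILTER_8x8_PRI, per tap k and quadrant, is roughly (reference
+; sketch in C terms):
+;     int diff = p[k] - px;
+;     sum += apply_sign(imin(abs(diff),
+;                imax(0, pri_strength - (abs(diff) >> shift))), diff)
+;            * pri_tap[k];
+; with pri_tap being {4,2} or {3,3} depending on pri & 1, and both taps
+; handled at once through the k0/k1 byte pairing of the luts.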
+    CDEF_FILTER_8x8_PRI
+    test          t1d, t1d        ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
+    pminub        m20, m8, m4
+    pminub        m24, m12, m16
+    pminub        m21, m9, m5
+    pminub        m25, m13, m17
+    pminub        m22, m10, m6
+    pminub        m26, m14, m18
+    pminub        m23, m11, m7
+    pminub        m27, m15, m19
+    pmaxub         m8, m4
+    pmaxub        m12, m16
+    pmaxub         m9, m5
+    pmaxub        m13, m17
+    pmaxub        m10, m6
+    pmaxub        m14, m18
+    pmaxub        m11, m7
+    pmaxub        m15, m19
+    pminub        m20, m24
+    pminub        m21, m25
+    pminub        m22, m26
+    pminub        m23, m27
+    pmaxub         m8, m12
+    pmaxub         m9, m13
+    pmaxub        m10, m14
+    pmaxub        m11, m15
+    mov           r2d, 0xAAAAAAAA
+    kmovd          k1, r2d
+    vpshrdd       m24,  m0,  m1, 16
+    vpshrdd       m25,  m2,  m3, 16
+    vpshrdd       m12, m20, m21, 16
+    vpshrdd       m14, m22, m23, 16
+    vpshrdd       m16,  m8,  m9, 16
+    vpshrdd       m18, m10, m11, 16
+    vpblendmw m13{k1}, m20, m21
+    vpblendmw m15{k1}, m22, m23
+    vpblendmw m17{k1},  m8,  m9
+    vpblendmw m19{k1}, m10, m11
+    vpblendmw m20{k1},  m0, m24
+    vpblendmw m21{k1}, m24, m1
+    vpblendmw m22{k1},  m2, m25
+    vpblendmw m23{k1}, m25, m3
+    vpshrdd        m4, m5, 16
+    vpshrdd        m6, m7, 16
+    pminub        m12, m13
+    pminub        m14, m15
+    pmaxub        m16, m17
+    pmaxub        m18, m19
+    mova           m8, [base+end_perm_w8clip]
+    vpcmpw         k2, m20, m31, 1
+    vpcmpw         k3, m22, m31, 1
+    vpshldw        m4, m21, 8
+    vpshldw        m6, m23, 8
+    kunpckdq       k1, k1, k1
+    kxnorb         k4, k4, k4
+    vpshrdw       m11, m12, m14, 8
+    vpshrdw       m15, m16, m18, 8
+    vpblendmb m13{k1}, m12, m14
+    vpblendmb m17{k1}, m16, m18
+    psubw         m21, m31, m20
+    psubw         m23, m31, m22
+    paddusw        m0, m20, m4  ; clip >0xff
+    paddusw        m1, m22, m6
+    pminub        m11, m13
+    pmaxub        m15, m17
+    psubusw    m0{k2}, m4, m21  ; clip <0x00
+    psubusw    m1{k3}, m6, m23
+    psrlw          m0, 8
+    vmovdqu8   m0{k1}, m1
+    pmaxub         m0, m11
+    pminub         m0, m15
+    vpermb         m0, m8, m0
+    add           r10, 2
+    vextracti32x4 xm1, m0, 1
+    vextracti32x4 xm2, m0, 2
+    vextracti32x4 xm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*2], xm1
+    movq   [r10 +strideq*0], xm2
+    movq   [r10 +strideq*2], xm3
+    movhps [dstq+strideq*1], xm0
+    movhps [dstq+r9       ], xm1
+    movhps [r10 +strideq*1], xm2
+    movhps [r10 +r9       ], xm3
+    RET
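+; The epilogue above mirrors the 4x8 clipped path at 8x8 scale: px and
+; every in-range neighbour feed per-quadrant (min, max) pairs, narrowed
+; from words to bytes with vpshrdd/vpblendmw so the same
+; iclip(px + rounded_sum, min, max) clamp is applied right before the
+; 8x8 store.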
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    mova          xm8, [base+end_perm]
+    kxnorb         k1, k1, k1
+    vpshldd        m4, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    vpshldd        m5, m1, 8
+    vpshldd        m6, m2, 8
+    vpshldd        m7, m3, 8
+    paddw          m0, m4     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+    paddw          m1, m5
+    paddw          m2, m6
+    paddw          m3, m7
+    vpermb         m0, m8, m0
+    vpermb         m1, m8, m1
+    vpermb         m2, m8, m2
+    vpermb         m3, m8, m3
+    add           r10, 2
+    punpckldq      m4, m0, m1
+    punpckhdq      m0, m1
+    punpckldq      m5, m2, m3
+    punpckhdq      m2, m3
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*2], xm0
+    movq   [r10 +strideq*0], xm5
+    movq   [r10 +strideq*2], xm2
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+r9       ], xm0
+    movhps [r10 +strideq*1], xm5
+    movhps [r10 +r9       ], xm2
+    RET
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    mov           t0d, r6d
+    mov           t1d, r6d
+    or            t0d, 0xA ; top-left 4x4 has bottom and right
+    or            t1d, 0x9 ; top-right 4x4 has bottom and left
+    vpbroadcastq  m26, [base+edge_mask+t0*8]
+    vpbroadcastq  m27, [base+edge_mask+t1*8]
+    mov           t1d, r6d
+    or            r6d, 0x6 ; bottom-left 4x4 has top and right
+    or            t1d, 0x5 ; bottom-right 4x4 has top and left
+    vpbroadcastq  m28, [base+edge_mask+r6*8]
+    vpbroadcastq  m29, [base+edge_mask+t1*8]
+    mov           t0d, dirm
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+    vpshufbitqmb   k1, m26, m20 ; index in-range
+    vpshufbitqmb   k2, m27, m20
+    vpshufbitqmb   k3, m28, m20
+    vpshufbitqmb   k4, m29, m20
+    mova           m8, m4
+    mova           m9, m5
+    mova          m10, m6
+    mova          m11, m7
+    vpermb     m8{k1}, m20, m12
+    vpermb     m9{k2}, m20, m13
+    vpermb    m10{k3}, m20, m14
+    vpermb    m11{k4}, m20, m15
+    mova   [rsp+0x00], m26
+    mova   [rsp+0x40], m27
+    mova   [rsp+0x80], m28
+    mova   [rsp+0xC0], m29
+    CDEF_FILTER_8x8_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    mova          m26, [rsp+0x00]
+    mova          m27, [rsp+0x40]
+    mova          m28, [rsp+0x80]
+    mova          m29, [rsp+0xC0]
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+    vpshufbitqmb   k1, m26, m20
+    vpshufbitqmb   k2, m27, m20
+    vpshufbitqmb   k3, m28, m20
+    vpshufbitqmb   k4, m29, m20
+    mova          m16, m4
+    mova          m17, m5
+    mova          m18, m6
+    mova          m19, m7
+    vpermb    m16{k1}, m20, m12
+    vpermb    m17{k2}, m20, m13
+    vpermb    m18{k3}, m20, m14
+    vpermb    m19{k4}, m20, m15
+    vpshufbitqmb   k1, m26, m21
+    vpshufbitqmb   k2, m27, m21
+    vpshufbitqmb   k3, m28, m21
+    vpshufbitqmb   k4, m29, m21
+    vpermb        m12, m21, m12
+    vpermb        m13, m21, m13
+    vpermb        m14, m21, m14
+    vpermb        m15, m21, m15
+    vpblendmb m12{k1}, m4, m12
+    vpblendmb m13{k2}, m5, m13
+    vpblendmb m14{k3}, m6, m14
+    vpblendmb m15{k4}, m7, m15
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+    vpermb        m17, m20, m13 ; pNtr
+    vpermb        m18, m20, m14 ; pNbl
+    vpermb        m19, m20, m15 ; pNbr
+    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+    vpermb        m13, m21, m13 ; pNtr
+    vpermb        m14, m21, m14 ; pNbl
+    vpermb        m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
+    vpcmpub        k1, m4, %1, 6
+    vpcmpub        k2, m5, %2, 6
+    vpcmpub        k3, m6, %3, 6
+    vpcmpub        k4, m7, %4, 6
+    psubb         m20, %1, m4
+    psubb         m21, %2, m5
+    psubb         m22, %3, m6
+    psubb         m23, %4, m7
+%if %5
+    vpbroadcastb  m28, t1d
+    lzcnt         t1d, t1d
+    vpbroadcastq  m29, [r3+t1*8]
+%endif
+    vpsubb    m20{k1}, m4, %1
+    vpsubb    m21{k2}, m5, %2
+    vpsubb    m22{k3}, m6, %3
+    vpsubb    m23{k4}, m7, %4
+    gf2p8affineqb m24, m20, m29, 0
+    gf2p8affineqb m25, m21, m29, 0
+    gf2p8affineqb m26, m22, m29, 0
+    gf2p8affineqb m27, m23, m29, 0
+%if %5
+    vpbroadcastd  m30, [base+sec_tap]
+%endif
+    psubusb       m24, m28, m24
+    psubusb       m25, m28, m25
+    psubusb       m26, m28, m26
+    psubusb       m27, m28, m27
+    pminub        m20, m24
+    pminub        m21, m25
+    pminub        m22, m26
+    pminub        m23, m27
+    mova          m24, m30
+    mova          m25, m30
+    mova          m26, m30
+    mova          m27, m30
+    vpsubb    m24{k1}, m31, m30
+    vpsubb    m25{k2}, m31, m30
+    vpsubb    m26{k3}, m31, m30
+    vpsubb    m27{k4}, m31, m30
+    vpdpbusd       m0, m20, m24
+    vpdpbusd       m1, m21, m25
+    vpdpbusd       m2, m22, m26
+    vpdpbusd       m3, m23, m27
+%endmacro
+    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+    ret
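+; CDEF_FILTER_8x8_SEC is instantiated twice: the first call (%5=1) also
+; loads the shared constants (broadcast strength, gf_shr row, sec_tap),
+; the second reuses them; the two calls cover the dir+2 and dir-2
+; secondary tap pairs respectively.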
+
+%endif ; HAVE_AVX512ICL && ARCH_X86_64
diff --git a/src/x86/cdef_init_tmpl.c b/src/x86/cdef_init_tmpl.c
new file mode 100644
index 0000000..edc3b5d
--- /dev/null
+++ b/src/x86/cdef_init_tmpl.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_size_fn(sz) \
+    decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
+    decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
+    decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
+    decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
+    decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
+
+decl_cdef_size_fn(4x4);
+decl_cdef_size_fn(4x8);
+decl_cdef_size_fn(8x8);
+
+decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
+decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
+
+COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+#if BITDEPTH == 8
+    c->fb[0] = dav1d_cdef_filter_8x8_sse2;
+    c->fb[1] = dav1d_cdef_filter_4x8_sse2;
+    c->fb[2] = dav1d_cdef_filter_4x4_sse2;
+#endif
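+    /* Pointers are assigned in ascending ISA order and overwritten at each
+     * level, so after the remaining flag checks c->dir and c->fb[] end up
+     * pointing at the fastest implementation the host CPU supports. */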
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+    c->dir = dav1d_cdef_dir_ssse3;
+    c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
+    c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
+    c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 8
+    c->dir = dav1d_cdef_dir_sse4;
+    c->fb[0] = dav1d_cdef_filter_8x8_sse4;
+    c->fb[1] = dav1d_cdef_filter_4x8_sse4;
+    c->fb[2] = dav1d_cdef_filter_4x4_sse4;
+#endif
+
+#if ARCH_X86_64
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8
+    c->dir = dav1d_cdef_dir_avx2;
+    c->fb[0] = dav1d_cdef_filter_8x8_avx2;
+    c->fb[1] = dav1d_cdef_filter_4x8_avx2;
+    c->fb[2] = dav1d_cdef_filter_4x4_avx2;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if HAVE_AVX512ICL && BITDEPTH == 8
+    c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
+    c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
+    c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
+#endif
+
+#endif
+}
diff --git a/src/x86/cdef_sse.asm b/src/x86/cdef_sse.asm
new file mode 100644
index 0000000..dda97b5
--- /dev/null
+++ b/src/x86/cdef_sse.asm
@@ -0,0 +1,1405 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2019, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_32
+pb_0: times 16 db 0
+pb_0xFF: times 16 db 0xFF
+%endif
+pw_8: times 8 dw 8
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+%if ARCH_X86_32
+pw_0x7FFF: times 8 dw 0x7FFF
+pw_0x8000: times 8 dw 0x8000
+%endif
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+                dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
+                 dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
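+; The first row is 840/1 .. 840/8 and the second its clamped counterpart
+; for the longer diagonals; cdef_dir multiplies each squared partial sum
+; by these factors to normalise it by the number of contributing samples.
+; The ssse3 table duplicates every value into both words of a dword so the
+; MULLD emulation below can consume it without pmulld.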
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+tap_table: ; masks for 8-bit shift emulation
+           db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+           ; weights
+           db 4, 2, 3, 3, 2, 1
+           ; taps indices
+           db -1 * 16 + 1, -2 * 16 + 2
+           db  0 * 16 + 1, -1 * 16 + 2
+           db  0 * 16 + 1,  0 * 16 + 2
+           db  0 * 16 + 1,  1 * 16 + 2
+           db  1 * 16 + 1,  2 * 16 + 2
+           db  1 * 16 + 0,  2 * 16 + 1
+           db  1 * 16 + 0,  2 * 16 + 0
+           db  1 * 16 + 0,  2 * 16 - 1
+           ; the last 6 are repeats of the first 6 so we don't need to & 7
+           db -1 * 16 + 1, -2 * 16 + 2
+           db  0 * 16 + 1, -1 * 16 + 2
+           db  0 * 16 + 1,  0 * 16 + 2
+           db  0 * 16 + 1,  1 * 16 + 2
+           db  1 * 16 + 1,  2 * 16 + 2
+           db  1 * 16 + 0,  2 * 16 + 1
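+; The first 8 bytes of tap_table are 0xFF >> n masks: lacking a per-byte
+; SIMD shift, ACCUMULATE_TAP below emulates abs(diff) >> shift with a
+; 16-bit psrlw followed by pand against the matching mask.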
+
+SECTION .text
+
+%macro movif32 2
+ %if ARCH_X86_32
+    mov     %1, %2
+ %endif
+%endmacro
+
+%macro SAVE_ARG 2   ; varname, argnum
+ %define %1_stkloc  [rsp+%2*gprsize]
+ %define %1_argnum  %2
+    mov             r2, r%2m
+    mov      %1_stkloc, r2
+%endmacro
+
+%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register
+ %if %2 == 0
+    mov r %+ %{1}_argnum, %1_stkloc
+ %else
+    mov            %1q, %1_stkloc
+ %endif
+%endmacro
+
+%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register
+ %if ARCH_X86_32
+  %if %0 == 1
+    LOAD_ARG %1
+  %else
+    LOAD_ARG %1, %2
+  %endif
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define PIC_sym(sym) sym
+%endif
+
+%macro SAVE_PIC_REG 1
+ %if ARCH_X86_32
+    mov       [esp+%1], PIC_reg
+ %endif
+%endmacro
+
+%macro LOAD_PIC_REG 1
+ %if ARCH_X86_32
+    mov        PIC_reg, [esp+%1]
+ %endif
+%endmacro
+
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if %3 == 1
+    movd            %1, %2
+ %else
+    movq            %1, %2
+ %endif
+    punpcklbw       %1, m15
+%endmacro
+
+%macro PSHUFB_0 2
+ %if cpuflag(ssse3)
+    pshufb          %1, %2
+ %else
+    punpcklbw       %1, %1
+    pshuflw         %1, %1, q0000
+    punpcklqdq      %1, %1
+ %endif
+%endmacro
+
+%macro LOAD_SEC_TAP 0
+ %if ARCH_X86_64
+    movd            m3, [secq+kq]
+    PSHUFB_0        m3, m15
+ %else
+    movd            m2, [secq+kq]             ; sec_taps
+    pxor            m3, m3
+    PSHUFB_0        m2, m3
+ %endif
+%endmacro
+
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
+    ; load p0/p1
+    movsx         offq, byte [dirq+kq+%1]       ; off1
+ %if %6 == 4
+    movq            m5, [stkq+offq*2+%7*0]      ; p0
+    movhps          m5, [stkq+offq*2+%7*1]
+ %else
+    movu            m5, [stkq+offq*2+%7*0]      ; p0
+ %endif
+    neg           offq                          ; -off1
+ %if %6 == 4
+    movq            m6, [stkq+offq*2+%7*0]      ; p1
+    movhps          m6, [stkq+offq*2+%7*1]
+ %else
+    movu            m6, [stkq+offq*2+%7*0]      ; p1
+ %endif
+ %if cpuflag(sse4)
+    ; out-of-bounds values are set to a value that is both a large unsigned
+    ; value and a negative signed value;
+    ; use signed max and unsigned min to remove them
+    pmaxsw          m7, m5
+    pminuw          m8, m5
+    pmaxsw          m7, m6
+    pminuw          m8, m6
+ %else
+  %if ARCH_X86_64
+    pcmpeqw         m9, m14, m5
+    pcmpeqw        m10, m14, m6
+    pandn           m9, m5
+    pandn          m10, m6
+    pmaxsw          m7, m9                      ; max after p0
+    pminsw          m8, m5                      ; min after p0
+    pmaxsw          m7, m10                     ; max after p1
+    pminsw          m8, m6                      ; min after p1
+  %else
+    pcmpeqw         m9, m5, OUT_OF_BOUNDS_MEM
+    pandn           m9, m5
+    pmaxsw          m7, m9                      ; max after p0
+    pminsw          m8, m5                      ; min after p0
+    pcmpeqw         m9, m6, OUT_OF_BOUNDS_MEM
+    pandn           m9, m6
+    pmaxsw          m7, m9                      ; max after p1
+    pminsw          m8, m6                      ; min after p1
+  %endif
+ %endif
+
+    ; accumulate sum[m13] over p0/p1
+    psubw           m5, m4          ; diff_p0(p0 - px)
+    psubw           m6, m4          ; diff_p1(p1 - px)
+    packsswb        m5, m6          ; convert pixel diff to 8-bit
+ %if cpuflag(ssse3)
+  %if ARCH_X86_64 && cpuflag(sse4)
+    pshufb          m5, m14         ; group diffs p0 and p1 into pairs
+  %else
+    pshufb          m5, [PIC_sym(shufb_lohi)]
+  %endif
+    pabsb           m6, m5
+    psignb          m9, %5, m5
+ %else
+    movlhps         m6, m5
+    punpckhbw       m6, m5
+    pxor            m5, m5
+    pcmpgtb         m5, m6
+    paddb           m6, m5
+    pxor            m6, m5
+    paddb           m9, %5, m5
+    pxor            m9, m5
+ %endif
+ %if ARCH_X86_64
+    psrlw          m10, m6, %2      ; emulate 8-bit shift
+    pand           m10, %3
+    psubusb         m5, %4, m10
+ %else
+    psrlw           m5, m6, %2      ; emulate 8-bit shift
+    pand            m5, %3
+    paddusb         m5, %4
+    pxor            m5, [PIC_sym(pb_0xFF)]
+ %endif
+    pminub          m5, m6          ; constrain(diff_p)
+ %if cpuflag(ssse3)
+    pmaddubsw       m5, m9          ; constrain(diff_p) * taps
+ %else
+    psrlw           m2, m5, 8
+    psraw           m6, m9, 8
+    psllw           m5, 8
+    psllw           m9, 8
+    pmullw          m2, m6
+    pmulhw          m5, m9
+    paddw           m5, m2
+ %endif
+    paddw          m13, m5
+%endmacro
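+; ACCUMULATE_TAP, in rough C terms (reference sketch; p0/p1 are the pixel
+; pair at +/-off along the chosen direction):
+;     max = imax(max, p0, p1);  min = imin(min, p0, p1);  // OOB-filtered
+;     for (p in {p0, p1}) {
+;         int d = p - px;
+;         sum += apply_sign(imin(abs(d),
+;                    imax(0, strength - (abs(d) >> shift))), d) * tap;
+;     }
+; Out-of-bounds lanes carry the OUT_OF_BOUNDS sentinel, which the signed
+; max / unsigned min (or pcmpeqw) logic strips from the min/max tracking
+; before it can widen the clamp range.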
+
+%macro LOAD_BODY 4  ; dst, src, block_width, tmp_stride
+ %if %3 == 4
+    PMOVZXBW        m0, [%2+strideq*0]
+    PMOVZXBW        m1, [%2+strideq*1]
+    PMOVZXBW        m2, [%2+strideq*2]
+    PMOVZXBW        m3, [%2+stride3q]
+ %else
+    movu            m0, [%2+strideq*0]
+    movu            m1, [%2+strideq*1]
+    movu            m2, [%2+strideq*2]
+    movu            m3, [%2+stride3q]
+    punpckhbw       m4, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m5, m1, m15
+    punpcklbw       m1, m15
+    punpckhbw       m6, m2, m15
+    punpcklbw       m2, m15
+    punpckhbw       m7, m3, m15
+    punpcklbw       m3, m15
+ %endif
+    mova     [%1+0*%4], m0
+    mova     [%1+1*%4], m1
+    mova     [%1+2*%4], m2
+    mova     [%1+3*%4], m3
+ %if %3 == 8
+    mova [%1+0*%4+2*8], m4
+    mova [%1+1*%4+2*8], m5
+    mova [%1+2*%4+2*8], m6
+    mova [%1+3*%4+2*8], m7
+ %endif
+%endmacro
+
+%macro CDEF_FILTER 3 ; w, h, stride
+
+ %if cpuflag(sse4)
+  %define OUT_OF_BOUNDS 0x80008000
+ %else
+  %define OUT_OF_BOUNDS 0x7FFF7FFF
+ %endif
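+; Sentinel choice: with sse4, 0x8000 is simultaneously a huge unsigned and
+; a negative signed word, so pmaxsw/pminuw discard it from both extremes;
+; without sse4, the positive 0x7FFF is used and matched explicitly with
+; pcmpeqw/pandn instead.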
+
+ %if ARCH_X86_64
+cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
+                           dst, stride, left, top, pri, sec, stride3, dst4, edge
+    pcmpeqw        m14, m14
+  %if cpuflag(sse4)
+    psllw          m14, 15                  ; 0x8000
+  %else
+    psrlw          m14, 1                   ; 0x7FFF
+  %endif
+    pxor           m15, m15
+
+  %define px rsp+3*16+2*%3
+ %else
+cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
+                           dst, stride, left, top, stride3, dst4, edge
+    SAVE_ARG      left, 2
+    SAVE_ARG       top, 3
+    SAVE_ARG       pri, 4
+    SAVE_ARG       sec, 5
+    SAVE_ARG       dir, 6
+    SAVE_ARG   damping, 7
+
+  %define PIC_reg r2
+    LEA        PIC_reg, PIC_base_offset
+
+  %if cpuflag(sse4)
+   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
+  %else
+   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
+  %endif
+
+  %define m15 [PIC_sym(pb_0)]
+
+  %define px esp+7*16+2*%3
+ %endif
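+; px points 2 rows into a stack buffer of (%2+4) rows of %3 bytes holding
+; the block as 16-bit pixels with a 2-pixel apron; the edge handling below
+; fills that apron with either real neighbours or OUT_OF_BOUNDS sentinels,
+; so the filter loop itself needs no bounds checks.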
+
+    mov          edged, r8m
+
+    ; prepare pixel buffers - body/right
+ %if %2 == 8
+    lea          dst4q, [dstq+strideq*4]
+ %endif
+    lea       stride3q, [strideq*3]
+    test         edged, 2                   ; have_right
+    jz .no_right
+    LOAD_BODY       px, dstq, %1, %3
+ %if %2 == 8
+    LOAD_BODY  px+4*%3, dst4q, %1, %3
+ %endif
+    jmp .body_done
+.no_right:
+    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
+    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
+    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
+    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
+ %if %2 == 8
+    PMOVZXBW        m4, [dst4q+strideq*0], %1 == 4
+    PMOVZXBW        m5, [dst4q+strideq*1], %1 == 4
+    PMOVZXBW        m6, [dst4q+strideq*2], %1 == 4
+    PMOVZXBW        m7, [dst4q+stride3q ], %1 == 4
+ %endif
+    mova     [px+0*%3], m0
+    mova     [px+1*%3], m1
+    mova     [px+2*%3], m2
+    mova     [px+3*%3], m3
+ %if %2 == 8
+    mova     [px+4*%3], m4
+    mova     [px+5*%3], m5
+    mova     [px+6*%3], m6
+    mova     [px+7*%3], m7
+    mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
+ %endif
+    mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
+.body_done:
+
+    ; top
+    LOAD_ARG32     top
+    test         edged, 4                    ; have_top
+    jz .no_top
+    test         edged, 1                    ; have_left
+    jz .top_no_left
+    test         edged, 2                    ; have_right
+    jz .top_no_right
+ %if %1 == 4
+    PMOVZXBW        m0, [topq+strideq*0-2]
+    PMOVZXBW        m1, [topq+strideq*1-2]
+ %else
+    movu            m0, [topq+strideq*0-4]
+    movu            m1, [topq+strideq*1-4]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    movu  [px-2*%3+8], m2
+    movu  [px-1*%3+8], m3
+ %endif
+    movu  [px-2*%3-%1], m0
+    movu  [px-1*%3-%1], m1
+    jmp .top_done
+.top_no_right:
+ %if %1 == 4
+    PMOVZXBW        m0, [topq+strideq*0-%1]
+    PMOVZXBW        m1, [topq+strideq*1-%1]
+    movu [px-2*%3-4*2], m0
+    movu [px-1*%3-4*2], m1
+ %else
+    movu            m0, [topq+strideq*0-%1]
+    movu            m1, [topq+strideq*1-%1]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    mova [px-2*%3-8*2], m0
+    mova [px-2*%3-0*2], m2
+    mova [px-1*%3-8*2], m1
+    mova [px-1*%3-0*2], m3
+ %endif
+    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+    jmp .top_done
+.top_no_left:
+    test         edged, 2                   ; have_right
+    jz .top_no_left_right
+ %if %1 == 4
+    PMOVZXBW        m0, [topq+strideq*0]
+    PMOVZXBW        m1, [topq+strideq*1]
+ %else
+    movu            m0, [topq+strideq*0]
+    movu            m1, [topq+strideq*1]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    movd [px-2*%3+8*2], m2
+    movd [px-1*%3+8*2], m3
+ %endif
+    mova     [px-2*%3], m0
+    mova     [px-1*%3], m1
+    mov dword [px-2*%3-4], OUT_OF_BOUNDS
+    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    jmp .top_done
+.top_no_left_right:
+    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
+    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
+    mova     [px-2*%3], m0
+    mova     [px-1*%3], m1
+    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px-2*%3-4], OUT_OF_BOUNDS
+    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    jmp .top_done
+.no_top:
+ %if ARCH_X86_64
+    SWAP            m0, m14
+ %else
+    mova            m0, OUT_OF_BOUNDS_MEM
+ %endif
+    movu   [px-2*%3-4], m0
+    movu   [px-1*%3-4], m0
+ %if %1 == 8
+    movq   [px-2*%3+12], m0
+    movq   [px-1*%3+12], m0
+ %endif
+ %if ARCH_X86_64
+    SWAP            m0, m14
+ %endif
+.top_done:
+
+    ; left
+    test         edged, 1                   ; have_left
+    jz .no_left
+    SAVE_PIC_REG     0
+    LOAD_ARG32    left
+ %if %2 == 4
+    movq            m0, [leftq]
+ %else
+    movu            m0, [leftq]
+ %endif
+    LOAD_PIC_REG     0
+ %if %2 == 4
+    punpcklbw       m0, m15
+ %else
+    punpckhbw       m1, m0, m15
+    punpcklbw       m0, m15
+    movhlps         m3, m1
+    movd   [px+4*%3-4], m1
+    movd   [px+6*%3-4], m3
+    psrlq           m1, 32
+    psrlq           m3, 32
+    movd   [px+5*%3-4], m1
+    movd   [px+7*%3-4], m3
+ %endif
+    movhlps         m2, m0
+    movd   [px+0*%3-4], m0
+    movd   [px+2*%3-4], m2
+    psrlq           m0, 32
+    psrlq           m2, 32
+    movd   [px+1*%3-4], m0
+    movd   [px+3*%3-4], m2
+    jmp .left_done
+.no_left:
+    mov dword [px+0*%3-4], OUT_OF_BOUNDS
+    mov dword [px+1*%3-4], OUT_OF_BOUNDS
+    mov dword [px+2*%3-4], OUT_OF_BOUNDS
+    mov dword [px+3*%3-4], OUT_OF_BOUNDS
+ %if %2 == 8
+    mov dword [px+4*%3-4], OUT_OF_BOUNDS
+    mov dword [px+5*%3-4], OUT_OF_BOUNDS
+    mov dword [px+6*%3-4], OUT_OF_BOUNDS
+    mov dword [px+7*%3-4], OUT_OF_BOUNDS
+ %endif
+.left_done:
+
+    ; bottom
+ %if ARCH_X86_64
+    DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge
+ %else
+    DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge
+ %endif
+    test         edged, 8                   ; have_bottom
+    jz .no_bottom
+    lea          dst8q, [dstq+%2*strideq]
+    test         edged, 1                   ; have_left
+    jz .bottom_no_left
+    test         edged, 2                   ; have_right
+    jz .bottom_no_right
+ %if %1 == 4
+    PMOVZXBW        m0, [dst8q-(%1/2)]
+    PMOVZXBW        m1, [dst8q+strideq-(%1/2)]
+ %else
+    movu            m0, [dst8q-4]
+    movu            m1, [dst8q+strideq-4]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    movu [px+(%2+0)*%3+8], m2
+    movu [px+(%2+1)*%3+8], m3
+ %endif
+    movu [px+(%2+0)*%3-%1], m0
+    movu [px+(%2+1)*%3-%1], m1
+    jmp .bottom_done
+.bottom_no_right:
+ %if %1 == 4
+    PMOVZXBW        m0, [dst8q-4]
+    PMOVZXBW        m1, [dst8q+strideq-4]
+    movu [px+(%2+0)*%3-4*2], m0
+    movu [px+(%2+1)*%3-4*2], m1
+ %else
+    movu            m0, [dst8q-8]
+    movu            m1, [dst8q+strideq-8]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    mova [px+(%2+0)*%3-8*2], m0
+    mova [px+(%2+0)*%3-0*2], m2
+    mova [px+(%2+1)*%3-8*2], m1
+    mova [px+(%2+1)*%3-0*2], m3
+    mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS     ; overwritten by first mova
+ %endif
+    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+    jmp .bottom_done
+.bottom_no_left:
+    test          edged, 2                  ; have_right
+    jz .bottom_no_left_right
+ %if %1 == 4
+    PMOVZXBW        m0, [dst8q]
+    PMOVZXBW        m1, [dst8q+strideq]
+ %else
+    movu            m0, [dst8q]
+    movu            m1, [dst8q+strideq]
+    punpckhbw       m2, m0, m15
+    punpcklbw       m0, m15
+    punpckhbw       m3, m1, m15
+    punpcklbw       m1, m15
+    mova [px+(%2+0)*%3+8*2], m2
+    mova [px+(%2+1)*%3+8*2], m3
+ %endif
+    mova [px+(%2+0)*%3], m0
+    mova [px+(%2+1)*%3], m1
+    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
+    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    jmp .bottom_done
+.bottom_no_left_right:
+    PMOVZXBW        m0, [dst8q+strideq*0], %1 == 4
+    PMOVZXBW        m1, [dst8q+strideq*1], %1 == 4
+    mova [px+(%2+0)*%3], m0
+    mova [px+(%2+1)*%3], m1
+    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
+    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    jmp .bottom_done
+.no_bottom:
+ %if ARCH_X86_64
+    SWAP            m0, m14
+ %else
+    mova            m0, OUT_OF_BOUNDS_MEM
+ %endif
+    movu [px+(%2+0)*%3-4], m0
+    movu [px+(%2+1)*%3-4], m0
+ %if %1 == 8
+    movq [px+(%2+0)*%3+12], m0
+    movq [px+(%2+1)*%3+12], m0
+ %endif
+ %if ARCH_X86_64
+    SWAP            m0, m14
+ %endif
+.bottom_done:
+
+    ; actual filter
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
+ %if ARCH_X86_64
+    movifnidn     prid, prim
+    movifnidn     secd, secm
+    mov       dampingd, r7m
+ %else
+    LOAD_ARG       pri
+    LOAD_ARG       sec
+    LOAD_ARG   damping, 1
+ %endif
+
+    SAVE_PIC_REG     8
+    mov        pridmpd, prid
+    mov        secdmpd, secd
+    or         pridmpd, 1
+    or         secdmpd, 1
+    bsr        pridmpd, pridmpd
+    bsr        secdmpd, secdmpd
+    sub        pridmpd, dampingd
+    sub        secdmpd, dampingd
+    xor       dampingd, dampingd
+    neg        pridmpd
+    cmovs      pridmpd, dampingd
+    neg        secdmpd
+    cmovs      secdmpd, dampingd
+ %if ARCH_X86_64
+    mov       [rsp+ 0], pridmpq                 ; pri_shift
+    mov       [rsp+16], secdmpq                 ; sec_shift
+ %else
+    mov     [esp+0x00], pridmpd
+    mov     [esp+0x30], secdmpd
+    mov dword [esp+0x04], 0                     ; zero upper 32 bits of psrlw
+    mov dword [esp+0x34], 0                     ; source operand in ACCUMULATE_TAP
+  %define PIC_reg r4
+    LOAD_PIC_REG     8
+ %endif
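+; The sequence above computes (reference formulae from the C code path):
+;     pri_shift = imax(0, damping - ulog2(pri));
+;     sec_shift = imax(0, damping - ulog2(sec));
+; bsr provides ulog2 (pri/sec are or'ed with 1 first so bsr is defined for
+; zero strengths), and neg+cmovs clamps negative results to zero.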
+
+    DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
+    lea         tableq, [PIC_sym(tap_table)]
+ %if ARCH_X86_64
+    SWAP            m2, m11
+    SWAP            m3, m12
+ %endif
+    movd            m2, [tableq+pridmpq]
+    movd            m3, [tableq+secdmpq]
+    PSHUFB_0        m2, m15                     ; pri_shift_mask
+    PSHUFB_0        m3, m15                     ; sec_shift_mask
+ %if ARCH_X86_64
+    SWAP            m2, m11
+    SWAP            m3, m12
+ %else
+  %define PIC_reg r6
+    mov        PIC_reg, r4
+    DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
+    LOAD_ARG       pri
+    LOAD_ARG       dir, 1
+    mova    [esp+0x10], m2
+    mova    [esp+0x40], m3
+ %endif
+
+    ; pri/sec_taps[k] [4 total]
+    DEFINE_ARGS dst, stride, dummy, tap, pri, sec
+    movd            m0, prid
+    movd            m1, secd
+ %if ARCH_X86_64
+    PSHUFB_0        m0, m15
+    PSHUFB_0        m1, m15
+ %else
+  %if cpuflag(ssse3)
+    pxor            m2, m2
+  %endif
+    mova            m3, [PIC_sym(pb_0xFF)]
+    PSHUFB_0        m0, m2
+    PSHUFB_0        m1, m2
+    pxor            m0, m3
+    pxor            m1, m3
+    mova    [esp+0x20], m0
+    mova    [esp+0x50], m1
+ %endif
+    and           prid, 1
+    lea           priq, [tapq+8+priq*2]         ; pri_taps
+    lea           secq, [tapq+12]               ; sec_taps
+
+ %if ARCH_X86_64 && cpuflag(sse4)
+    mova           m14, [shufb_lohi]
+ %endif
+
+    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+    DEFINE_ARGS dst, stride, dir, tap, pri, sec
+ %if ARCH_X86_64
+    mov           dird, r6m
+    lea           dirq, [tapq+14+dirq*2]
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %else
+    lea           dird, [tapd+14+dird*2]
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec
+  %define hd    dword [esp+8]
+  %define offq  dstq
+  %define kq    strideq
+ %endif
+    mov             hd, %1*%2*2/mmsize
+    lea           stkq, [px]
+    movif32 [esp+0x3C], strided
+.v_loop:
+    movif32 [esp+0x38], dstd
+    mov             kq, 1
+ %if %1 == 4
+    movq            m4, [stkq+%3*0]
+    movhps          m4, [stkq+%3*1]
+ %else
+    mova            m4, [stkq+%3*0]             ; px
+ %endif
+
+ %if ARCH_X86_32
+  %xdefine m9   m3
+  %xdefine m13  m7
+  %xdefine  m7  m0
+  %xdefine  m8  m1
+ %endif
+
+    pxor           m13, m13                     ; sum
+    mova            m7, m4                      ; max
+    mova            m8, m4                      ; min
+.k_loop:
+    movd            m2, [priq+kq]               ; pri_taps
+ %if ARCH_X86_64
+    PSHUFB_0        m2, m15
+  %if cpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
+    ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
+    ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
+    ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
+ %else
+  %if cpuflag(ssse3)
+    pxor            m3, m3
+  %endif
+    PSHUFB_0        m2, m3
+    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
+    LOAD_SEC_TAP                                ; sec_taps
+    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
+    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+ %endif
+
+    dec             kq
+    jge .k_loop
+
+    pxor            m6, m6
+    pcmpgtw         m6, m13
+    paddw          m13, m6
+ %if cpuflag(ssse3)
+    pmulhrsw       m13, [PIC_sym(pw_2048)]
+ %else
+    paddw          m13, [PIC_sym(pw_8)]
+    psraw          m13, 4
+ %endif
+    paddw           m4, m13
+    pminsw          m4, m7
+    pmaxsw          m4, m8
+    packuswb        m4, m4
+    movif32       dstd, [esp+0x38]
+    movif32    strided, [esp+0x3C]
+ %if %1 == 4
+    movd [dstq+strideq*0], m4
+    psrlq           m4, 32
+    movd [dstq+strideq*1], m4
+ %else
+    movq [dstq], m4
+ %endif
+
+ %if %1 == 4
+ %define vloop_lines (mmsize/(%1*2))
+    lea           dstq, [dstq+strideq*vloop_lines]
+    add           stkq, %3*vloop_lines
+ %else
+    lea           dstq, [dstq+strideq]
+    add           stkq, %3
+ %endif
+    dec             hd
+    jg .v_loop
+
+    RET
+%endmacro
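+; Loop structure: .v_loop walks the block rows (two at a time for width 4),
+; and .k_loop runs k = 1, 0; each iteration issues one primary tap
+; (offset dir) and two secondary taps (offsets dir+2 and dir-2, the latter
+; reached through the +6 entry of the repeated taps table).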
+
+%macro MULLD 2
+ %if cpuflag(sse4)
+    pmulld          %1, %2
+ %else
+  %if ARCH_X86_32
+   %define m15 m1
+  %endif
+    pmulhuw        m15, %1, %2
+    pmullw          %1, %2
+    pslld          m15, 16
+    paddd           %1, m15
+ %endif
+%endmacro
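+; MULLD is a pmulld substitute for pre-sse4 targets: pmullw/pmulhuw give
+; the low and high halves of the unsigned 16x16-bit word products, and the
+; high half is shifted up and folded in, yielding the 32-bit product of
+; the operands' low words.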
+
+%macro CDEF_DIR 0
+ %if ARCH_X86_64
+cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
+    lea       stride3q, [strideq*3]
+    movq            m1, [srcq+strideq*0]
+    movhps          m1, [srcq+strideq*1]
+    movq            m3, [srcq+strideq*2]
+    movhps          m3, [srcq+stride3q]
+    lea           srcq, [srcq+strideq*4]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+
+    pxor            m8, m8
+    psadbw          m0, m1, m8
+    psadbw          m2, m3, m8
+    psadbw          m4, m5, m8
+    psadbw          m6, m7, m8
+    packssdw        m0, m2
+    packssdw        m4, m6
+    packssdw        m0, m4
+    SWAP            m0, m9
+
+    punpcklbw       m0, m1, m8
+    punpckhbw       m1, m8
+    punpcklbw       m2, m3, m8
+    punpckhbw       m3, m8
+    punpcklbw       m4, m5, m8
+    punpckhbw       m5, m8
+    punpcklbw       m6, m7, m8
+    punpckhbw       m7, m8
+
+    mova            m8, [pw_128]
+    psubw           m0, m8
+    psubw           m1, m8
+    psubw           m2, m8
+    psubw           m3, m8
+    psubw           m4, m8
+    psubw           m5, m8
+    psubw           m6, m8
+    psubw           m7, m8
+    psllw           m8, 3
+    psubw           m9, m8                  ; partial_sum_hv[0]
+
+    paddw           m8, m0, m1
+    paddw          m10, m2, m3
+    paddw           m8, m4
+    paddw          m10, m5
+    paddw           m8, m6
+    paddw          m10, m7
+    paddw           m8, m10                 ; partial_sum_hv[1]
+
+    pmaddwd         m8, m8
+    pmaddwd         m9, m9
+    phaddd          m9, m8
+    SWAP            m8, m9
+    MULLD           m8, [div_table%+SUFFIX+48]
+
+    pslldq          m9, m1, 2
+    psrldq         m10, m1, 14
+    pslldq         m11, m2, 4
+    psrldq         m12, m2, 12
+    pslldq         m13, m3, 6
+    psrldq         m14, m3, 10
+    paddw           m9, m0
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
+    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
+    pslldq         m11, m4, 8
+    psrldq         m12, m4, 8
+    pslldq         m13, m5, 10
+    psrldq         m14, m5, 6
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m6, 12
+    psrldq         m12, m6, 4
+    pslldq         m13, m7, 14
+    psrldq         m14, m7, 2
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
+    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
+    pshufb         m10, [shufw_6543210x]
+    punpckhwd      m11, m9, m10
+    punpcklwd       m9, m10
+    pmaddwd        m11, m11
+    pmaddwd         m9, m9
+    MULLD          m11, [div_table%+SUFFIX+16]
+    MULLD           m9, [div_table%+SUFFIX+0]
+    paddd           m9, m11                 ; cost[0a-d]
+
+    pslldq         m10, m0, 14
+    psrldq         m11, m0, 2
+    pslldq         m12, m1, 12
+    psrldq         m13, m1, 4
+    pslldq         m14, m2, 10
+    psrldq         m15, m2, 6
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m3, 8
+    psrldq         m13, m3, 8
+    pslldq         m14, m4, 6
+    psrldq         m15, m4, 10
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m5, 4
+    psrldq         m13, m5, 12
+    pslldq         m14, m6, 2
+    psrldq         m15, m6, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
+    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
+    pshufb         m11, [shufw_6543210x]
+    punpckhwd      m12, m10, m11
+    punpcklwd      m10, m11
+    pmaddwd        m12, m12
+    pmaddwd        m10, m10
+    MULLD          m12, [div_table%+SUFFIX+16]
+    MULLD          m10, [div_table%+SUFFIX+0]
+    paddd          m10, m12                 ; cost[4a-d]
+    phaddd          m9, m10                 ; cost[0a/b,4a/b]
+
+    paddw          m10, m0, m1
+    paddw          m11, m2, m3
+    paddw          m12, m4, m5
+    paddw          m13, m6, m7
+    phaddw          m0, m4
+    phaddw          m1, m5
+    phaddw          m2, m6
+    phaddw          m3, m7
+
+    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+    pslldq          m4, m11, 2
+    psrldq          m5, m11, 14
+    pslldq          m6, m12, 4
+    psrldq          m7, m12, 12
+    pslldq         m14, m13, 6
+    psrldq         m15, m13, 10
+    paddw           m4, m10
+    paddw           m5, m7
+    paddw           m4, m6
+    paddw           m5, m15                 ; partial_sum_alt[3] right
+    paddw           m4, m14                 ; partial_sum_alt[3] left
+    pshuflw         m6, m5, q3012
+    punpckhwd       m5, m4
+    punpcklwd       m4, m6
+    pmaddwd         m5, m5
+    pmaddwd         m4, m4
+    MULLD           m5, [div_table%+SUFFIX+48]
+    MULLD           m4, [div_table%+SUFFIX+32]
+    paddd           m4, m5                  ; cost[7a-d]
+
+    pslldq          m5, m10, 6
+    psrldq          m6, m10, 10
+    pslldq          m7, m11, 4
+    psrldq         m10, m11, 12
+    pslldq         m11, m12, 2
+    psrldq         m12, 14
+    paddw           m5, m7
+    paddw           m6, m10
+    paddw           m5, m11
+    paddw           m6, m12
+    paddw           m5, m13
+    pshuflw         m7, m6, q3012
+    punpckhwd       m6, m5
+    punpcklwd       m5, m7
+    pmaddwd         m6, m6
+    pmaddwd         m5, m5
+    MULLD           m6, [div_table%+SUFFIX+48]
+    MULLD           m5, [div_table%+SUFFIX+32]
+    paddd           m5, m6                  ; cost[5a-d]
+
+    pslldq          m6, m1, 2
+    psrldq          m7, m1, 14
+    pslldq         m10, m2, 4
+    psrldq         m11, m2, 12
+    pslldq         m12, m3, 6
+    psrldq         m13, m3, 10
+    paddw           m6, m0
+    paddw           m7, m11
+    paddw           m6, m10
+    paddw           m7, m13                 ; partial_sum_alt[3] right
+    paddw           m6, m12                 ; partial_sum_alt[3] left
+    pshuflw        m10, m7, q3012
+    punpckhwd       m7, m6
+    punpcklwd       m6, m10
+    pmaddwd         m7, m7
+    pmaddwd         m6, m6
+    MULLD           m7, [div_table%+SUFFIX+48]
+    MULLD           m6, [div_table%+SUFFIX+32]
+    paddd           m6, m7                  ; cost[1a-d]
+
+    pshufd          m0, m0, q1032
+    pshufd          m1, m1, q1032
+    pshufd          m2, m2, q1032
+    pshufd          m3, m3, q1032
+
+    pslldq         m10, m0, 6
+    psrldq         m11, m0, 10
+    pslldq         m12, m1, 4
+    psrldq         m13, m1, 12
+    pslldq         m14, m2, 2
+    psrldq          m2, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m2
+    paddw          m10, m3
+    pshuflw        m12, m11, q3012
+    punpckhwd      m11, m10
+    punpcklwd      m10, m12
+    pmaddwd        m11, m11
+    pmaddwd        m10, m10
+    MULLD          m11, [div_table%+SUFFIX+48]
+    MULLD          m10, [div_table%+SUFFIX+32]
+    paddd          m10, m11                 ; cost[3a-d]
+
+    phaddd          m9, m8                  ; cost[0,4,2,6]
+    phaddd          m6, m10
+    phaddd          m5, m4
+    phaddd          m6, m5                  ; cost[1,3,5,7]
+    pshufd          m4, m9, q3120
+
+    ; now find the best cost
+  %if cpuflag(sse4)
+    pmaxsd          m9, m6
+    pshufd          m0, m9, q1032
+    pmaxsd          m0, m9
+    pshufd          m1, m0, q2301
+    pmaxsd          m0, m1                  ; best cost
+  %else
+    pcmpgtd         m0, m9, m6
+    pand            m9, m0
+    pandn           m0, m6
+    por             m9, m0
+    pshufd          m1, m9, q1032
+    pcmpgtd         m0, m9, m1
+    pand            m9, m0
+    pandn           m0, m1
+    por             m9, m0
+    pshufd          m1, m9, q2301
+    pcmpgtd         m0, m9, m1
+    pand            m9, m0
+    pandn           m0, m1
+    por             m0, m9
+  %endif
+
+    ; get direction and variance
+    punpckhdq       m1, m4, m6
+    punpckldq       m4, m6
+    psubd           m2, m0, m1
+    psubd           m3, m0, m4
+    mova    [rsp+0x00], m2                  ; emulate ymm in stack
+    mova    [rsp+0x10], m3
+    pcmpeqd         m1, m0                  ; compute best cost mask
+    pcmpeqd         m4, m0
+    packssdw        m4, m1
+    pmovmskb       eax, m4                  ; get byte-idx from mask
+    tzcnt          eax, eax
+    mov            r1d, [rsp+rax*2]         ; get idx^4 complement from emulated ymm
+    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
+    shr            r1d, 10
+    mov         [varq], r1d
+ %else
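+; x86-32 version: only 8 XMM registers are usable here, so the partial sums
+; are spilled to the stack and all constants go through a PIC base register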
+cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
+  %define PIC_reg r4
+    LEA        PIC_reg, PIC_base_offset
+
+    pxor            m0, m0
+    mova            m1, [PIC_sym(pw_128)]
+
+    lea       stride3q, [strideq*3]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m2, m5, m0
+    psadbw          m3, m7, m0
+    packssdw        m2, m3
+    punpcklbw       m4, m5, m0
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    mova    [esp+0x00], m4
+    mova    [esp+0x10], m5
+    mova    [esp+0x20], m6
+    mova    [esp+0x50], m7
+
+    lea           srcq, [srcq+strideq*4]
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m3, m5, m0
+    psadbw          m0, m7, m0
+    packssdw        m3, m0
+    pxor            m0, m0
+    packssdw        m2, m3
+    punpcklbw       m4, m5, m0
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    psllw           m1, 3
+    psubw           m2, m1                  ; partial_sum_hv[0]
+    pmaddwd         m2, m2
+
+    mova            m3, [esp+0x50]
+    mova            m0, [esp+0x00]
+    paddw           m0, [esp+0x10]
+    paddw           m1, m3, [esp+0x20]
+    paddw           m0, m4
+    paddw           m1, m5
+    paddw           m0, m6
+    paddw           m1, m7
+    paddw           m0, m1                  ; partial_sum_hv[1]
+    pmaddwd         m0, m0
+
+    phaddd          m2, m0
+    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
+    mova    [esp+0x30], m2
+
+    mova            m1, [esp+0x10]
+    pslldq          m0, m1, 2
+    psrldq          m1, 14
+    paddw           m0, [esp+0x00]
+    pslldq          m2, m3, 6
+    psrldq          m3, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 4
+    psrldq          m3, 12
+    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
+    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
+    pslldq          m2, m4, 8
+    psrldq          m3, m4, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 10
+    psrldq          m3, m5, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 12
+    psrldq          m3, m6, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m7, 14
+    psrldq          m3, m7, 2
+    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
+    mova            m3, [esp+0x50]
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    paddd           m0, m2                  ; cost[0a-d]
+    mova    [esp+0x40], m0
+
+    mova            m1, [esp+0x00]
+    pslldq          m0, m1, 14
+    psrldq          m1, 2
+    paddw           m0, m7
+    pslldq          m2, m3, 8
+    psrldq          m3, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 10
+    psrldq          m3, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x10]
+    pslldq          m2, m3, 12
+    psrldq          m3, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m4, 6
+    psrldq          m3, m4, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 4
+    psrldq          m3, m5, 12
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 2
+    psrldq          m3, m6, 14
+    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
+    mova            m3, [esp+0x50]
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    paddd           m0, m2                  ; cost[4a-d]
+    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
+    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
+    mova    [esp+0x30], m1
+
+    phaddw          m0, [esp+0x00], m4
+    phaddw          m1, [esp+0x10], m5
+    paddw           m4, m5
+    mova            m2, [esp+0x20]
+    paddw           m5, m2, m3
+    phaddw          m2, m6
+    paddw           m6, m7
+    phaddw          m3, m7
+    mova            m7, [esp+0x00]
+    paddw           m7, [esp+0x10]
+    mova    [esp+0x00], m0
+    mova    [esp+0x10], m1
+    mova    [esp+0x20], m2
+
+    pslldq          m1, m4, 4
+    pslldq          m2, m6, 6
+    pslldq          m0, m5, 2
+    paddw           m1, m2
+    paddw           m0, m7
+    psrldq          m2, m5, 14
+    paddw           m0, m1                  ; partial_sum_alt[3] left
+    psrldq          m1, m4, 12
+    paddw           m1, m2
+    psrldq          m2, m6, 10
+    paddw           m1, m2                  ; partial_sum_alt[3] right
+    pshuflw         m1, m1, q3012
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    paddd           m0, m2                  ; cost[7a-d]
+    mova    [esp+0x40], m0
+
+    pslldq          m0, m7, 6
+    psrldq          m7, 10
+    pslldq          m1, m5, 4
+    psrldq          m5, 12
+    pslldq          m2, m4, 2
+    psrldq          m4, 14
+    paddw           m0, m6
+    paddw           m7, m5
+    paddw           m0, m1
+    paddw           m7, m4
+    paddw           m0, m2
+    pshuflw         m2, m7, q3012
+    punpckhwd       m7, m0
+    punpcklwd       m0, m2
+    pmaddwd         m7, m7
+    pmaddwd         m0, m0
+    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    paddd           m0, m7                  ; cost[5a-d]
+    mova    [esp+0x50], m0
+
+    mova            m7, [esp+0x10]
+    mova            m2, [esp+0x20]
+    pslldq          m0, m7, 2
+    psrldq          m7, 14
+    pslldq          m4, m2, 4
+    psrldq          m2, 12
+    pslldq          m5, m3, 6
+    psrldq          m6, m3, 10
+    paddw           m0, [esp+0x00]
+    paddw           m7, m2
+    paddw           m4, m5
+    paddw           m7, m6                  ; partial_sum_alt[0] right
+    paddw           m0, m4                  ; partial_sum_alt[0] left
+    pshuflw         m2, m7, q3012
+    punpckhwd       m7, m0
+    punpcklwd       m0, m2
+    pmaddwd         m7, m7
+    pmaddwd         m0, m0
+    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    paddd           m0, m7                  ; cost[1a-d]
+    SWAP            m0, m4
+
+    pshufd          m0, [esp+0x00], q1032
+    pshufd          m1, [esp+0x10], q1032
+    pshufd          m2, [esp+0x20], q1032
+    pshufd          m3, m3, q1032
+    mova    [esp+0x00], m4
+
+    pslldq          m4, m0, 6
+    psrldq          m0, 10
+    pslldq          m5, m1, 4
+    psrldq          m1, 12
+    pslldq          m6, m2, 2
+    psrldq          m2, 14
+    paddw           m4, m3
+    paddw           m0, m1
+    paddw           m5, m6
+    paddw           m0, m2
+    paddw           m4, m5
+    pshuflw         m2, m0, q3012
+    punpckhwd       m0, m4
+    punpcklwd       m4, m2
+    pmaddwd         m0, m0
+    pmaddwd         m4, m4
+    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m4, [PIC_sym(div_table%+SUFFIX)+32]
+    paddd           m4, m0                   ; cost[3a-d]
+
+    mova            m1, [esp+0x00]
+    mova            m2, [esp+0x50]
+    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
+    phaddd          m1, m4
+    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
+    phaddd          m1, m2
+    pshufd          m2, m0, q3120
+
+    ; now find the best cost
+  %if cpuflag(sse4)
+    pmaxsd          m0, m1
+    pshufd          m3, m0, q1032
+    pmaxsd          m3, m0
+    pshufd          m0, m3, q2301
+    pmaxsd          m0, m3
+  %else
+    pcmpgtd         m3, m0, m1
+    pand            m0, m3
+    pandn           m3, m1
+    por             m0, m3
+    pshufd          m4, m0, q1032
+    pcmpgtd         m3, m0, m4
+    pand            m0, m3
+    pandn           m3, m4
+    por             m0, m3
+    pshufd          m4, m0, q2301
+    pcmpgtd         m3, m0, m4
+    pand            m0, m3
+    pandn           m3, m4
+    por             m0, m3
+  %endif
+
+    ; get direction and variance
+    punpckhdq       m3, m2, m1
+    punpckldq       m2, m1
+    psubd           m1, m0, m3
+    psubd           m4, m0, m2
+    mova    [esp+0x00], m1                  ; emulate ymm in stack
+    mova    [esp+0x10], m4
+    pcmpeqd         m3, m0                  ; compute best cost mask
+    pcmpeqd         m2, m0
+    packssdw        m2, m3
+    pmovmskb       eax, m2                  ; get byte-idx from mask
+    tzcnt          eax, eax
+    mov            r1d, [esp+eax*2]         ; get idx^4 complement from emulated ymm
+    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
+    shr            r1d, 10
+    mov         [vard], r1d
+ %endif
+
+    RET
+%endmacro
+
+INIT_XMM sse4
+CDEF_FILTER 8, 8, 32
+CDEF_FILTER 4, 8, 32
+CDEF_FILTER 4, 4, 32
+CDEF_DIR
+
+INIT_XMM ssse3
+CDEF_FILTER 8, 8, 32
+CDEF_FILTER 4, 8, 32
+CDEF_FILTER 4, 4, 32
+CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8, 32
+CDEF_FILTER 4, 8, 32
+CDEF_FILTER 4, 4, 32
diff --git a/src/x86/cpu.c b/src/x86/cpu.c
new file mode 100644 (file)
index 0000000..eb2b4bb
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+#include "src/x86/cpu.h"
+
+typedef struct {
+    uint32_t eax, ebx, ecx, edx;
+} CpuidRegisters;
+
+void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
+uint64_t dav1d_cpu_xgetbv(unsigned xcr);
+
+#define X(reg, mask) (((reg) & (mask)) == (mask))
+
+COLD unsigned dav1d_get_cpu_flags_x86(void) {
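+    /* The flags are hierarchical: SSE41 implies SSSE3, which implies SSE2;
+     * AVX2 and AVX512ICL additionally require the OS to have enabled the
+     * wider register state, which is verified through XGETBV below. */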
+    CpuidRegisters r = { 0 };
+    dav1d_cpu_cpuid(&r, 0, 0);
+    const unsigned max_leaf = r.eax;
+    unsigned flags = 0;
+
+    if (max_leaf >= 1) {
+        dav1d_cpu_cpuid(&r, 1, 0);
+        if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
+            flags |= DAV1D_X86_CPU_FLAG_SSE2;
+            if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
+                flags |= DAV1D_X86_CPU_FLAG_SSSE3;
+                if (X(r.ecx, 0x00080000)) /* SSE4.1 */
+                    flags |= DAV1D_X86_CPU_FLAG_SSE41;
+            }
+        }
+#if ARCH_X86_64
+        /* We only support >128-bit SIMD on x86-64. */
+        if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
+            const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
+            if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
+                if (max_leaf >= 7) {
+                    dav1d_cpu_cpuid(&r, 7, 0);
+                    if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
+                        flags |= DAV1D_X86_CPU_FLAG_AVX2;
+                        if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
+                            if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
+                                flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
+                        }
+                    }
+                }
+            }
+        }
+#endif
+    }
+
+    return flags;
+}
diff --git a/src/x86/cpu.h b/src/x86/cpu.h
new file mode 100644 (file)
index 0000000..26ebc38
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_CPU_H
+#define DAV1D_SRC_X86_CPU_H
+
+enum CpuFlags {
+    DAV1D_X86_CPU_FLAG_SSE2      = 1 << 0,
+    DAV1D_X86_CPU_FLAG_SSSE3     = 1 << 1,
+    DAV1D_X86_CPU_FLAG_SSE41     = 1 << 2,
+    DAV1D_X86_CPU_FLAG_AVX2      = 1 << 3,
+    DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
+                                            * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+};
+
+unsigned dav1d_get_cpu_flags_x86(void);
+
+#endif /* DAV1D_SRC_X86_CPU_H */
diff --git a/src/x86/cpuid.asm b/src/x86/cpuid.asm
new file mode 100644 (file)
index 0000000..c08df50
--- /dev/null
@@ -0,0 +1,54 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION .text
+
+cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
+    mov        r4, regsmp
+    mov       eax, leafm
+    mov       ecx, subleafm
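+    ; cpuid clobbers ebx/rbx; rbx is callee-saved in the x86-64 ABIs, so
+    ; stash it in a scratch register around the instruction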
+%if ARCH_X86_64
+    mov        r5, rbx
+%endif
+    cpuid
+    mov  [r4+4*0], eax
+    mov  [r4+4*1], ebx
+    mov  [r4+4*2], ecx
+    mov  [r4+4*3], edx
+%if ARCH_X86_64
+    mov       rbx, r5
+%endif
+    RET
+
+cglobal cpu_xgetbv, 0, 0, 0, xcr
+    movifnidn ecx, xcrm
+    xgetbv
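+    ; the XCR value is returned split across edx:eax; fold it into rax for
+    ; the 64-bit calling convention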
+%if ARCH_X86_64
+    shl       rdx, 32
+    or        rax, rdx
+%endif
+    RET
diff --git a/src/x86/film_grain.asm b/src/x86/film_grain.asm
new file mode 100644 (file)
index 0000000..94ee123
--- /dev/null
@@ -0,0 +1,2404 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_8x_27_17_8x_17_27: times 8 db 27, 17
+                      times 8 db 17, 27
+pw_1024: times 16 dw 1024
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+             times 2 dw 0x49d8
+pd_m65536: dd ~0xffff
+pb_23_22: times 2 db 23, 22
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pb_27_17_17_27: db 27, 17, 17, 27
+pw_1: dw 1
+
+%macro JMP_TABLE 1-*
+    %xdefine %1_table %%table
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    %%table:
+    %rep %0 - 1
+        dd %%prefix %+ .ar%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+ALIGN 4
+JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
+
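+; field layout mirrors the public Dav1dFilmGrainData struct
+; (include/dav1d/headers.h), so the asm can index the C struct directly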
+struc FGData
+    .seed:                      resd 1
+    .num_y_points:              resd 1
+    .y_points:                  resb 14 * 2
+    .chroma_scaling_from_luma:  resd 1
+    .num_uv_points:             resd 2
+    .uv_points:                 resb 2 * 10 * 2
+    .scaling_shift:             resd 1
+    .ar_coeff_lag:              resd 1
+    .ar_coeffs_y:               resb 24
+    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
+    .ar_coeff_shift:            resq 1
+    .grain_scale_shift:         resd 1
+    .uv_mult:                   resd 2
+    .uv_luma_mult:              resd 2
+    .uv_offset:                 resd 2
+    .overlap_flag:              resd 1
+    .clip_to_restricted_range:  resd 1
+endstruc
+
+cextern gaussian_sequence
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal generate_grain_y, 2, 9, 16, buf, fg_data
+    lea              r4, [pb_mask]
+%define base r4-pb_mask
+    movq            xm1, [base+rnd_next_upperbit_mask]
+    movq            xm4, [base+mul_bits]
+    movq            xm7, [base+hmul_bits]
+    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
+    vpbroadcastw    xm8, [base+round+r2*2]
+    mova            xm5, [base+pb_mask]
+    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
+    vpbroadcastd    xm9, [base+pd_m65536]
+    mov              r2, -73*82
+    sub            bufq, r2
+    lea              r3, [gaussian_sequence]
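+    ; each iteration advances four copies of the grain LFSR in parallel,
+    ; gathers four values from the 2048-entry gaussian_sequence table via
+    ; 11-bit indices, scales them and stores four grain bytes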
+.loop:
+    pand            xm2, xm0, xm1
+    psrlw           xm3, xm2, 10
+    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw          xm2, xm4            ; bits 0x0f00 are set
+    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
+    psllq           xm6, xm2, 30
+    por             xm2, xm6
+    psllq           xm6, xm2, 15
+    por             xm2, xm6            ; aggregate each bit into next seed's high bit
+    pmulhuw         xm3, xm0, xm7
+    por             xm2, xm3            ; 4 next output seeds
+    pshuflw         xm0, xm2, q3333
+    psrlw           xm2, 5
+    pmovzxwd        xm3, xm2
+    mova            xm6, xm9
+    vpgatherdd      xm2, [r3+xm3*2], xm6
+    pandn           xm2, xm9, xm2
+    packusdw        xm2, xm2
+    pmulhrsw        xm2, xm8
+    packsswb        xm2, xm2
+    movd      [bufq+r2], xm2
+    add              r2, 4
+    jl .loop
+
+    ; auto-regression code
+    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r2, [base+generate_grain_y_avx2_table+r2*4]
+    lea              r2, [r2+base+generate_grain_y_avx2_table]
+    jmp              r2
+
+.ar1:
+    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
+    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+    pinsrb          xm4, [pb_1], 3
+    pmovsxbw        xm4, xm4
+    pshufd          xm5, xm4, q1111
+    pshufd          xm4, xm4, q0000
+    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+    mov            mind, -128
+    mov            maxd, 127
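+    ; the three top taps are computed four pixels ahead with pmaddwd; the
+    ; data-dependent left tap (cf3) is resolved serially in the inner loop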
+.y_loop_ar1:
+    mov              xq, -76
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+    pmovsxbw        xm2, [bufq+xq-82+0]     ; top
+    pmovsxbw        xm1, [bufq+xq-82+1]     ; top/right
+    punpcklwd       xm0, xm2
+    punpcklwd       xm1, xm3
+    pmaddwd         xm0, xm4
+    pmaddwd         xm1, xm5
+    paddd           xm0, xm1
+.x_loop_ar1_inner:
+    movd          val0d, xm0
+    psrldq          xm0, 4
+    imul          val3d, cf3d
+    add           val3d, val0d
+%if WIN64
+    sarx          val3d, val3d, shiftd
+%else
+    sar           val3d, shiftb
+%endif
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovns        val3d, maxd
+    cmp           val3d, mind
+    cmovs         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar1
+.ar0:
+    RET
+
+.ar2:
+    DEFINE_ARGS buf, fg_data, shift
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
+    movq           xm15, [base+byte_blend+1]
+    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
+    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
+    pmovsxbw        xm9, xm9
+    DEFINE_ARGS buf, fg_data, h, x
+    pshufd         xm12, xm9, q0000
+    pshufd         xm13, xm9, q1111
+    pshufd         xm11, xm8, q3333
+    pshufd         xm10, xm8, q2222
+    pshufd          xm9, xm8, q1111
+    pshufd          xm8, xm8, q0000
+    pmovzxwd       xm14, xm14
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+.y_loop_ar2:
+    mov              xq, -76
+
+.x_loop_ar2:
+    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
+    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
+    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
+    punpcklwd       xm2, xm0, xm2
+    punpcklwd       xm3, xm4
+    pmaddwd         xm2, xm8
+    pmaddwd         xm3, xm11
+    paddd           xm2, xm3
+
+    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
+    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
+    psrldq          xm6, xm0, 8             ; y=-2,x=[+2,+5]
+    punpcklwd       xm4, xm5
+    punpcklwd       xm6, xm1
+    psrldq          xm7, xm1, 6             ; y=-1,x=[+1,+5]
+    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
+    punpcklwd       xm7, xm1
+    pmaddwd         xm4, xm9
+    pmaddwd         xm6, xm10
+    pmaddwd         xm7, xm12
+    paddd           xm4, xm6
+    paddd           xm2, xm7
+    paddd           xm2, xm4
+    paddd           xm2, xm14
+
+    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+    pmovsxbw        xm1, xm0
+    pmaddwd         xm3, xm1, xm13
+    paddd           xm3, xm2
+    psrldq          xm1, 4                  ; y=0,x=0
+    psrldq          xm2, 4                  ; shift top to next pixel
+    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw since we only care about one value
+    paddw           xm3, xm1
+    packsswb        xm3, xm3
+    pextrb    [bufq+xq], xm3, 0
+    pslldq          xm3, 2
+    pand            xm3, xm15
+    pandn           xm0, xm15, xm0
+    por             xm0, xm3
+    psrldq          xm0, 1
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+    DEFINE_ARGS buf, fg_data, shift
+%if WIN64
+    SUB             rsp, 16*12
+%assign stack_size_padded (stack_size_padded+16*12)
+%assign stack_size (stack_size+16*12)
+%else
+    ALLOC_STACK   16*12
+%endif
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
+    movq           xm15, [base+byte_blend]
+    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-7
+    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_y+ 8]   ; cf8-15
+    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
+    pshufd          xm9, xm0, q1111
+    pshufd         xm10, xm0, q2222
+    pshufd         xm11, xm0, q3333
+    pshufd          xm0, xm0, q0000
+    pshufd          xm6, xm1, q1111
+    pshufd          xm7, xm1, q2222
+    pshufd          xm8, xm1, q3333
+    pshufd          xm1, xm1, q0000
+    pshufd          xm3, xm2, q1111
+    psrldq         xm13, xm2, 10
+    pinsrw          xm2, [pw_1], 5
+    pshufd          xm4, xm2, q2222
+    pshufd          xm2, xm2, q0000
+    pinsrw         xm13, [base+round_vals+shiftq*2-10], 3
+    mova    [rsp+ 0*16], xm0
+    mova    [rsp+ 1*16], xm9
+    mova    [rsp+ 2*16], xm10
+    mova    [rsp+ 3*16], xm11
+    mova    [rsp+ 4*16], xm1
+    mova    [rsp+ 5*16], xm6
+    mova    [rsp+ 6*16], xm7
+    mova    [rsp+ 7*16], xm8
+    mova    [rsp+ 8*16], xm2
+    mova    [rsp+ 9*16], xm3
+    mova    [rsp+10*16], xm4
+    DEFINE_ARGS buf, fg_data, h, x
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+.y_loop_ar3:
+    mov              xq, -76
+
+.x_loop_ar3:
+    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor            xm3, xm3
+    pcmpgtb         xm6, xm3, xm2
+    pcmpgtb         xm5, xm3, xm1
+    pcmpgtb         xm4, xm3, xm0
+    punpckhbw       xm3, xm0, xm4
+    punpcklbw       xm0, xm4
+    punpckhbw       xm4, xm1, xm5
+    punpcklbw       xm1, xm5
+    punpckhbw       xm5, xm2, xm6
+    punpcklbw       xm2, xm6
+
+    psrldq          xm6, xm0, 2
+    psrldq          xm7, xm0, 4
+    psrldq          xm8, xm0, 6
+    psrldq          xm9, xm0, 8
+    palignr        xm10, xm3, xm0, 10
+    palignr        xm11, xm3, xm0, 12
+
+    punpcklwd       xm0, xm6
+    punpcklwd       xm7, xm8
+    punpcklwd       xm9, xm10
+    punpcklwd      xm11, xm1
+    pmaddwd         xm0, [rsp+ 0*16]
+    pmaddwd         xm7, [rsp+ 1*16]
+    pmaddwd         xm9, [rsp+ 2*16]
+    pmaddwd        xm11, [rsp+ 3*16]
+    paddd           xm0, xm7
+    paddd           xm9, xm11
+    paddd           xm0, xm9
+
+    psrldq          xm6, xm1, 2
+    psrldq          xm7, xm1, 4
+    psrldq          xm8, xm1, 6
+    psrldq          xm9, xm1, 8
+    palignr        xm10, xm4, xm1, 10
+    palignr        xm11, xm4, xm1, 12
+    psrldq         xm12, xm2, 2
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd      xm10, xm11
+    punpcklwd      xm12, xm2, xm12
+    pmaddwd         xm6, [rsp+ 4*16]
+    pmaddwd         xm8, [rsp+ 5*16]
+    pmaddwd        xm10, [rsp+ 6*16]
+    pmaddwd        xm12, [rsp+ 7*16]
+    paddd           xm6, xm8
+    paddd          xm10, xm12
+    paddd           xm6, xm10
+    paddd           xm0, xm6
+
+    psrldq          xm6, xm2, 4
+    psrldq          xm7, xm2, 6
+    psrldq          xm8, xm2, 8
+    palignr         xm9, xm5, xm2, 10
+    palignr         xm5, xm5, xm2, 12
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd       xm5, xm14
+    pmaddwd         xm6, [rsp+ 8*16]
+    pmaddwd         xm8, [rsp+ 9*16]
+    pmaddwd         xm5, [rsp+10*16]
+    paddd           xm0, xm6
+    paddd           xm8, xm5
+    paddd           xm0, xm8
+
+    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+    pmovsxbw        xm2, xm1
+    pmaddwd         xm2, xm13
+    pshufd          xm3, xm2, q1111
+    paddd           xm2, xm3                ; left+cur
+    paddd           xm2, xm0                ; add top
+    psrldq          xm0, 4
+    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw since we only care about one value
+    packsswb        xm2, xm2
+    pextrb    [bufq+xq], xm2, 0
+    pslldq          xm2, 3
+    pand            xm2, xm15
+    pandn           xm1, xm15, xm1
+    por             xm1, xm2
+    psrldq          xm1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar3
+    RET
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
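+; ss_x/ss_y halve the chroma grain buffer dimensions: 44x38 for 4:2:0,
+; 44x73 for 4:2:2 and the full 82x73 for 4:4:4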
+INIT_XMM avx2
+cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
+    lea              r4, [pb_mask]
+%define base r4-pb_mask
+    movq            xm1, [base+rnd_next_upperbit_mask]
+    movq            xm4, [base+mul_bits]
+    movq            xm7, [base+hmul_bits]
+    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
+    vpbroadcastw    xm8, [base+round+r5*2]
+    mova            xm5, [base+pb_mask]
+    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
+    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
+    pxor            xm0, xm9
+    vpbroadcastd    xm9, [base+pd_m65536]
+    lea              r6, [gaussian_sequence]
+%if %2
+    mov             r7d, 73-35*%3
+    add            bufq, 44
+.loop_y:
+    mov              r5, -44
+.loop_x:
+%else
+    mov              r5, -73*82
+    sub            bufq, r5
+.loop:
+%endif
+    pand            xm2, xm0, xm1
+    psrlw           xm3, xm2, 10
+    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw          xm2, xm4            ; bits 0x0f00 are set
+    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
+    psllq           xm6, xm2, 30
+    por             xm2, xm6
+    psllq           xm6, xm2, 15
+    por             xm2, xm6            ; aggregate each bit into next seed's high bit
+    pmulhuw         xm3, xm0, xm7
+    por             xm2, xm3            ; 4 next output seeds
+    pshuflw         xm0, xm2, q3333
+    psrlw           xm2, 5
+    pmovzxwd        xm3, xm2
+    mova            xm6, xm9
+    vpgatherdd      xm2, [r6+xm3*2], xm6
+    pandn           xm2, xm9, xm2
+    packusdw        xm2, xm2
+    pmulhrsw        xm2, xm8
+    packsswb        xm2, xm2
+    movd      [bufq+r5], xm2
+    add              r5, 4
+%if %2
+    jl .loop_x
+    add            bufq, 82
+    dec             r7d
+    jg .loop_y
+%else
+    jl .loop
+%endif
+
+    ; auto-regression code
+    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_avx2_table]
+    jmp              r5
+
+.ar0:
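+    ; with lag 0, the single chroma coefficient scales the (sub)sampled
+    ; collocated luma grain, which is then added to the chroma grain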
+    INIT_YMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    imul            uvd, 28
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    movd            xm3, [base+hmul_bits+shiftq*2]
+    DEFINE_ARGS buf, bufy, h
+    pmovsxbw        xm4, xm4
+%if %2
+    vpbroadcastd     m7, [pb_1]
+    vpbroadcastw     m6, [hmul_bits+2+%3*2]
+%endif
+    vpbroadcastw     m4, xm4
+    vpbroadcastw     m3, xm3
+    pxor            m12, m12
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
+    add           bufyq, 3+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+    ; first 32 pixels
+    movu            xm8, [bufyq]
+%if %3
+    movu            xm9, [bufyq+82]
+%endif
+    movu           xm10, [bufyq+16]
+%if %3
+    movu           xm11, [bufyq+82+16]
+%endif
+    vinserti128      m8, [bufyq+32], 1
+%if %3
+    vinserti128      m9, [bufyq+82+32], 1
+%endif
+    vinserti128     m10, [bufyq+48], 1
+%if %3
+    vinserti128     m11, [bufyq+82+48], 1
+%endif
+    pmaddubsw        m8, m7, m8
+%if %3
+    pmaddubsw        m9, m7, m9
+%endif
+    pmaddubsw       m10, m7, m10
+%if %3
+    pmaddubsw       m11, m7, m11
+    paddw            m8, m9
+    paddw           m10, m11
+%endif
+    pmulhrsw         m8, m6
+    pmulhrsw        m10, m6
+%else
+    xor             r3d, r3d
+    ; first 32x2 pixels
+.x_loop_ar0:
+    movu             m8, [bufyq+r3]
+    pcmpgtb          m9, m12, m8
+    punpckhbw       m10, m8, m9
+    punpcklbw        m8, m9
+%endif
+    pmullw           m8, m4
+    pmullw          m10, m4
+    pmulhrsw         m8, m3
+    pmulhrsw        m10, m3
+%if %2
+    movu             m0, [bufq]
+%else
+    movu             m0, [bufq+r3]
+%endif
+    pcmpgtb          m1, m12, m0
+    punpckhbw        m9, m0, m1
+    punpcklbw        m0, m1
+    paddw            m0, m8
+    paddw            m9, m10
+    packsswb         m0, m9
+%if %2
+    movu         [bufq], m0
+%else
+    movu      [bufq+r3], m0
+    add             r3d, 32
+    cmp             r3d, 64
+    jl .x_loop_ar0
+%endif
+
+    ; last 6/12 pixels
+    movu            xm8, [bufyq+32*2]
+%if %2
+%if %3
+    movu            xm9, [bufyq+32*2+82]
+%endif
+    pmaddubsw       xm8, xm7, xm8
+%if %3
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+%endif
+    pmulhrsw        xm8, xm6
+    pmullw          xm8, xm4
+    pmulhrsw        xm8, xm3
+    movq            xm0, [bufq+32]
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm9, xm0, xm9
+    paddw           xm8, xm9
+    packsswb        xm8, xm8
+    vpblendw        xm0, xm8, xm0, 1000b
+    movq      [bufq+32], xm0
+%else
+    pcmpgtb         xm9, xm12, xm8
+    punpckhbw      xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    pmullw         xm10, xm4
+    pmullw          xm8, xm4
+    pmulhrsw       xm10, xm3
+    pmulhrsw        xm8, xm3
+    movu            xm0, [bufq+64]
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm1, xm0, xm9
+    punpckhbw       xm9, xm0, xm9
+    paddw           xm1, xm8
+    paddw           xm9, xm10
+    packsswb        xm1, xm9
+    vpblendw        xm0, xm1, xm0, 11000000b
+    movu      [bufq+64], xm0
+%endif
+
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar0
+    RET
+
+.ar1:
+    INIT_XMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+    imul            uvd, 28
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+    pmovsxbw        xm4, xm4
+    pshufd          xm5, xm4, q1111
+    pshufd          xm4, xm4, q0000
+    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
+    vpbroadcastd    xm7, [pb_1]
+    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
+%endif
+    vpbroadcastd    xm3, xm3
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+    mov            mind, -128
+    mov            maxd, 127
+.y_loop_ar1:
+    mov              xq, -(76>>%2)
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+%if %2
+    movq            xm8, [bufyq+xq*2]
+%if %3
+    movq            xm9, [bufyq+xq*2+82]
+%endif
+%endif
+    psrldq          xm2, xm0, 2             ; top
+    psrldq          xm1, xm0, 4             ; top/right
+%if %2
+    pmaddubsw       xm8, xm7, xm8
+%if %3
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+%endif
+    pmulhrsw        xm8, xm6
+%else
+    pmovsxbw        xm8, [bufyq+xq]
+%endif
+    punpcklwd       xm0, xm2
+    punpcklwd       xm1, xm8
+    pmaddwd         xm0, xm4
+    pmaddwd         xm1, xm5
+    paddd           xm0, xm1
+    paddd           xm0, xm3
+.x_loop_ar1_inner:
+    movd          val0d, xm0
+    psrldq          xm0, 4
+    imul          val3d, cf3d
+    add           val3d, val0d
+    sarx          val3d, val3d, shiftd
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovns        val3d, maxd
+    cmp           val3d, mind
+    cmovs         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar1
+    RET
+
+.ar2:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 28
+    vpbroadcastw   xm15, [base+round_vals-12+shiftq*2]
+    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
+    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
+    pinsrw          xm9, [base+pw_1], 5
+%if %2
+    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
+    vpbroadcastd    xm6, [base+pb_1]
+%endif
+    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+    pshufd         xm12, xm9, q0000
+    pshufd         xm13, xm9, q1111
+    pshufd         xm14, xm9, q2222
+    pshufd         xm11, xm8, q3333
+    pshufd         xm10, xm8, q2222
+    pshufd          xm9, xm8, q1111
+    pshufd          xm8, xm8, q0000
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar2:
+    mov              xq, -(76>>%2)
+
+.x_loop_ar2:
+    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
+    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
+    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
+    punpcklwd       xm2, xm0, xm2
+    punpcklwd       xm3, xm4
+    pmaddwd         xm2, xm8
+    pmaddwd         xm3, xm11
+    paddd           xm2, xm3
+
+    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
+    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
+    psrldq          xm0, 8                  ; y=-2,x=[+2,+5]
+    punpcklwd       xm4, xm5
+    punpcklwd       xm0, xm1
+    psrldq          xm3, xm1, 6             ; y=-1,x=[+1,+5]
+    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
+    punpcklwd       xm3, xm1
+    pmaddwd         xm4, xm9
+    pmaddwd         xm0, xm10
+    pmaddwd         xm3, xm12
+    paddd           xm4, xm0
+    paddd           xm2, xm3
+    paddd           xm2, xm4
+
+%if %2
+    movq            xm0, [bufyq+xq*2]
+%if %3
+    movq            xm3, [bufyq+xq*2+82]
+%endif
+    pmaddubsw       xm0, xm6, xm0
+%if %3
+    pmaddubsw       xm3, xm6, xm3
+    paddw           xm0, xm3
+%endif
+    pmulhrsw        xm0, xm7
+%else
+    pmovsxbw        xm0, [bufyq+xq]
+%endif
+    punpcklwd       xm0, xm15
+    pmaddwd         xm0, xm14
+    paddd           xm2, xm0
+
+    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+    pmovsxbw        xm0, xm0
+    pmaddwd         xm3, xm0, xm13
+    paddd           xm3, xm2
+    psrldq          xm2, 4                  ; shift top to next pixel
+    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
+    pslldq          xm3, 2
+    psrldq          xm0, 2
+    paddw           xm3, xm0
+    vpblendw        xm0, xm3, 00000010b
+    packsswb        xm0, xm0
+    pextrb    [bufq+xq], xm0, 1
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    SUB             rsp, 16*12
+%assign stack_size_padded (stack_size_padded+16*12)
+%assign stack_size (stack_size+16*12)
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 28
+    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
+    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
+    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
+    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
+    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
+    pshufd          xm9, xm0, q1111
+    pshufd         xm10, xm0, q2222
+    pshufd         xm11, xm0, q3333
+    pshufd          xm0, xm0, q0000
+    pshufd          xm6, xm1, q1111
+    pshufd          xm7, xm1, q2222
+    pshufd          xm8, xm1, q3333
+    pshufd          xm1, xm1, q0000
+    pshufd          xm3, xm2, q1111
+    pshufd          xm4, xm2, q2222
+    vpbroadcastw    xm5, xm5
+    vpblendw        xm4, xm5, 10101010b                     ; interleave luma cf
+    psrldq          xm5, xm2, 10
+    pshufd          xm2, xm2, q0000
+    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
+    pmovzxwd       xm14, xm14
+    mova    [rsp+ 0*16], xm0
+    mova    [rsp+ 1*16], xm9
+    mova    [rsp+ 2*16], xm10
+    mova    [rsp+ 3*16], xm11
+    mova    [rsp+ 4*16], xm1
+    mova    [rsp+ 5*16], xm6
+    mova    [rsp+ 6*16], xm7
+    mova    [rsp+ 7*16], xm8
+    mova    [rsp+ 8*16], xm2
+    mova    [rsp+ 9*16], xm3
+    mova    [rsp+10*16], xm4
+    mova    [rsp+11*16], xm5
+%if %2
+    vpbroadcastd   xm13, [base+pb_1]
+    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
+%endif
+    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar3:
+    mov              xq, -(76>>%2)
+
+.x_loop_ar3:
+    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor            xm3, xm3
+    pcmpgtb         xm6, xm3, xm2
+    pcmpgtb         xm5, xm3, xm1
+    pcmpgtb         xm4, xm3, xm0
+    punpckhbw       xm3, xm0, xm4
+    punpcklbw       xm0, xm4
+    punpckhbw       xm4, xm1, xm5
+    punpcklbw       xm1, xm5
+    punpckhbw       xm5, xm2, xm6
+    punpcklbw       xm2, xm6
+
+    psrldq          xm6, xm0, 2
+    psrldq          xm7, xm0, 4
+    psrldq          xm8, xm0, 6
+    psrldq          xm9, xm0, 8
+    palignr        xm10, xm3, xm0, 10
+    palignr        xm11, xm3, xm0, 12
+
+    punpcklwd       xm0, xm6
+    punpcklwd       xm7, xm8
+    punpcklwd       xm9, xm10
+    punpcklwd      xm11, xm1
+    pmaddwd         xm0, [rsp+ 0*16]
+    pmaddwd         xm7, [rsp+ 1*16]
+    pmaddwd         xm9, [rsp+ 2*16]
+    pmaddwd        xm11, [rsp+ 3*16]
+    paddd           xm0, xm7
+    paddd           xm9, xm11
+    paddd           xm0, xm9
+
+    psrldq          xm6, xm1, 2
+    psrldq          xm7, xm1, 4
+    psrldq          xm8, xm1, 6
+    psrldq          xm9, xm1, 8
+    palignr        xm10, xm4, xm1, 10
+    palignr        xm11, xm4, xm1, 12
+    psrldq         xm12, xm2, 2
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd      xm10, xm11
+    punpcklwd      xm12, xm2, xm12
+    pmaddwd         xm6, [rsp+ 4*16]
+    pmaddwd         xm8, [rsp+ 5*16]
+    pmaddwd        xm10, [rsp+ 6*16]
+    pmaddwd        xm12, [rsp+ 7*16]
+    paddd           xm6, xm8
+    paddd          xm10, xm12
+    paddd           xm6, xm10
+    paddd           xm0, xm6
+
+    psrldq          xm6, xm2, 4
+    psrldq          xm7, xm2, 6
+    psrldq          xm8, xm2, 8
+    palignr         xm9, xm5, xm2, 10
+    palignr         xm5, xm5, xm2, 12
+
+%if %2
+    movq            xm1, [bufyq+xq*2]
+%if %3
+    movq            xm2, [bufyq+xq*2+82]
+%endif
+    pmaddubsw       xm1, xm13, xm1
+%if %3
+    pmaddubsw       xm2, xm13, xm2
+    paddw           xm1, xm2
+%endif
+    pmulhrsw        xm1, xm15
+%else
+    pmovsxbw        xm1, [bufyq+xq]
+%endif
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd       xm5, xm1
+    pmaddwd         xm6, [rsp+ 8*16]
+    pmaddwd         xm8, [rsp+ 9*16]
+    pmaddwd         xm5, [rsp+10*16]
+    paddd           xm0, xm6
+    paddd           xm8, xm5
+    paddd           xm0, xm8
+    paddd           xm0, xm14
+
+    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+    pmovsxbw        xm1, xm1
+    pmaddwd         xm2, xm1, [rsp+16*11]
+    pshufd          xm3, xm2, q1111
+    paddd           xm2, xm3                ; left+cur
+    paddd           xm2, xm0                ; add top
+    psrldq          xm0, 4
+    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw, we only care about one value
+    pslldq          xm2, 6
+    vpblendw        xm1, xm2, 1000b
+    packsswb        xm1, xm1
+    pextrb    [bufq+xq], xm1, 3
+    psrldq          xm1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar3
+    RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+INIT_YMM avx2
+cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+    pcmpeqw         m10, m10
+    psrld           m10, 24
+    mov             r7d, [fg_dataq+FGData.scaling_shift]
+    lea              r8, [pb_mask]
+%define base r8-pb_mask
+    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
+    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
+    vpbroadcastw    m12, [base+max+r7*4]
+    vpbroadcastw    m13, [base+min+r7*2]
+
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+    mov        overlapd, [fg_dataq+FGData.overlap_flag]
+    movifnidn      sbyd, sbym
+    test           sbyd, sbyd
+    setnz           r7b
+    test            r7b, overlapb
+    jnz .vertical_overlap
+
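+    ; per-row seed from the AV1 spec, folded into a single imul:
+    ; seed = (((sby*37+178) & 0xff) << 8) | ((sby*173+105) & 0xff),
+    ; xored with the frame seed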
+    imul           seed, sbyd, (173 << 24) | 37
+    add            seed, (105 << 24) | 178
+    rol            seed, 8
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                unused1, unused2, see, overlap
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub            dstq, srcq
+
+.loop_x:
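+    ; advance the LFSR one step: the new top bit is the parity of seed
+    ; bits 0, 1, 3 and 12, isolated by the 0xEFF4 mask and test/cmovp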
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                ; updated seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, overlap
+
+    mov           offxd, seed
+    rorx          offyd, seed, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, overlap
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+.loop_y:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+    punpckhwd        m5, m0, m2
+    punpcklwd        m4, m0, m2
+    punpckhwd        m7, m1, m2
+    punpcklwd        m6, m1, m2             ; m4-7: src as dword
+
+    ; scaling[src]
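+    ; vpgatherdd zeroes its mask operand, so the all-ones masks must be
+    ; regenerated before every pair of gathers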
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src + noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova    [dstq+srcq], m0
+
+    add            srcq, strideq
+    add      grain_lutq, 82
+    dec              hd
+    jg .loop_y
+
+    add              wq, 32
+    jge .end
+    lea            srcq, [src_bakq+wq]
+    test       overlapd, overlapd
+    jz .loop_x
+
+    ; r8m = sbym
+    movd           xm15, [pb_27_17_17_27]
+    cmp       dword r8m, 0
+    jne .loop_x_hv_overlap
+
+    ; horizontal overlap (without vertical overlap)
+    movd           xm14, [pw_1024]
+.loop_x_h_overlap:
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                ; updated seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy
+
+    lea     left_offxyd, [offyd+32]         ; previous column's offy*stride+offx
+    mov           offxd, seed
+    rorx          offyd, seed, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+    punpckhwd        m5, m0, m2
+    punpcklwd        m4, m0, m2
+    punpckhwd        m7, m1, m2
+    punpcklwd        m6, m1, m2             ; m4-7: src as dword
+
+    ; scaling[src]
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
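+    ; blend the two leftmost grain pixels with the previous column using
+    ; the 27/17 and 17/27 overlap weights (pmaddubsw), rounding via
+    ; round2(x, 5) (pmulhrsw with 1024)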
+    movd            xm4, [grain_lutq+left_offxyq]
+    punpcklbw       xm4, xm3
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+    packsswb        xm4, xm4
+    vpblendw        xm4, xm3, 11111110b
+    vpblendd         m3, m4, 00001111b
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src + noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova    [dstq+srcq], m0
+
+    add            srcq, strideq
+    add      grain_lutq, 82
+    dec              hd
+    jg .loop_y_h_overlap
+
+    add              wq, 32
+    jge .end
+    lea            srcq, [src_bakq+wq]
+
+    ; r8m = sbym
+    cmp       dword r8m, 0
+    jne .loop_x_hv_overlap
+    jmp .loop_x_h_overlap
+
+.end:
+    RET
+
+.vertical_overlap:
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+    movzx          sbyd, sbyb
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+    imul            r7d, sbyd, 173 * 0x00010001
+    imul           sbyd, 37 * 0x01000100
+    add             r7d, (105 << 16) | 188
+    add            sbyd, (178 << 24) | (141 << 8)
+    and             r7d, 0x00ff00ff
+    and            sbyd, 0xff00ff00
+    xor            seed, r7d
+    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
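+    ; the constants in the low halves are pre-biased so that the low 16
+    ; bits come out as the seed of row sby-1: 188 = (105 - 173) & 0xff
+    ; and 141 = (178 - 37) & 0xff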
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                unused1, unused2, see, overlap
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub            dstq, srcq
+
+    vpbroadcastd    m14, [pw_1024]
+.loop_x_v_overlap:
+    vpbroadcastw    m15, [pb_27_17_17_27]
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b                     ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
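+    ; both 16-bit seeds advance in one pass: the two setp's collect each
+    ; half's feedback parity, the or/xor pair converts those parities
+    ; into actual feedback bits sitting at bit 0 of the opposite half,
+    ; and a single 32-bit rorx then shifts both halves right by one while
+    ; rotating each feedback bit into its half's bit 15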
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, overlap, top_offxy
+
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
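+    ; the low half (the top block's offxy) additionally skips down 32
+    ; grain rows (32*82), so vertical overlap reads the bottom rows of
+    ; the grain belonging to the block above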
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, overlap, top_offxy
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+    punpckhwd        m5, m0, m2
+    punpcklwd        m4, m0, m2
+    punpckhwd        m7, m1, m2
+    punpcklwd        m6, m1, m2             ; m4-7: src as dword
+
+    ; scaling[src]
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+    movu             m4, [grain_lutq+top_offxyq]
+    punpckhbw        m6, m4, m3
+    punpcklbw        m4, m3
+    pmaddubsw        m6, m15, m6
+    pmaddubsw        m4, m15, m4
+    pmulhrsw         m6, m14
+    pmulhrsw         m4, m14
+    packsswb         m3, m4, m6
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova    [dstq+srcq], m0
+
+    vpbroadcastw    m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
+    add            srcq, strideq
+    add      grain_lutq, 82
+    dec              hw
+    jz .end_y_v_overlap
+    ; 2 lines get vertical overlap, then fall back to non-overlap code for
+    ; remaining (up to) 30 lines
+    btc              hd, 16
+    jnc .loop_y_v_overlap
+    jmp .loop_y
+
+.end_y_v_overlap:
+    add              wq, 32
+    jge .end_hv
+    lea            srcq, [src_bakq+wq]
+
+    ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall through to
+    ; h+v overlap
+
+    movd           xm15, [pb_27_17_17_27]
+.loop_x_hv_overlap:
+    vpbroadcastw     m8, [pb_27_17_17_27]
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b                     ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+    lea  topleft_offxyq, [top_offxyq+32]
+    lea     left_offxyq, [offyq+32]
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+    punpckhwd        m5, m0, m2
+    punpcklwd        m4, m0, m2
+    punpckhwd        m7, m1, m2
+    punpcklwd        m6, m1, m2             ; m4-7: src as dword
+
+    ; scaling[src]
+    pcmpeqw          m3, m3
+    ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
+    vpgatherdd       m9, [scalingq+m4], m3
+    pcmpeqw          m3, m3
+    vpgatherdd       m4, [scalingq+m5], m3
+    pcmpeqw          m3, m3
+    vpgatherdd       m5, [scalingq+m6], m3
+    pcmpeqw          m3, m3
+    vpgatherdd       m6, [scalingq+m7], m3
+    pand             m9, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m9, m4
+    packusdw         m5, m6
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+    movu             m6, [grain_lutq+top_offxyq]
+    movd            xm4, [grain_lutq+left_offxyq]
+    movd            xm7, [grain_lutq+topleft_offxyq]
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw       xm4, xm3
+    punpcklbw       xm7, xm6
+    pmaddubsw       xm4, xm15, xm4
+    pmaddubsw       xm7, xm15, xm7
+    pmulhrsw        xm4, xm14
+    pmulhrsw        xm7, xm14
+    packsswb        xm4, xm4
+    packsswb        xm7, xm7
+    vpblendw        xm4, xm3, 11111110b
+    vpblendw        xm7, xm6, 11111110b
+    vpblendd         m3, m4, 00001111b
+    vpblendd         m6, m7, 00001111b
+    ; followed by v interpolation (top | cur -> cur)
+    punpckhbw        m7, m6, m3
+    punpcklbw        m6, m3
+    pmaddubsw        m7, m8, m7
+    pmaddubsw        m6, m8, m6
+    pmulhrsw         m7, m14
+    pmulhrsw         m6, m14
+    packsswb         m3, m6, m7
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m9
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova    [dstq+srcq], m0
+
+    vpbroadcastw     m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
+    add            srcq, strideq
+    add      grain_lutq, 82
+    dec              hw
+    jz .end_y_hv_overlap
+    ; 2 lines get vertical overlap, then fall back to non-overlap code for
+    ; remaining (up to) 30 lines
+    btc              hd, 16
+    jnc .loop_y_hv_overlap
+    jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+    add              wq, 32
+    lea            srcq, [src_bakq+wq]
+    jl .loop_x_hv_overlap
+
+.end_hv:
+    RET
+
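+; %1 names the chroma layout, %2/%3 are its horizontal/vertical
+; subsampling flags (420 -> 1,1; 422 -> 1,0; 444 -> 0,0). the loop body
+; below is instantiated twice per layout: once with explicit uv scaling
+; (%%FGUV_32x32xN_LOOP 1, ...) and once for chroma_scaling_from_luma
+; (%%FGUV_32x32xN_LOOP 0, ...), selected at run time via the .csfl branch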
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
+    pcmpeqw         m10, m10
+    psrld           m10, 24
+    mov             r7d, [fg_dataq+FGData.scaling_shift]
+    lea              r8, [pb_mask]
+%define base r8-pb_mask
+    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
+    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
+    mov             r9d, dword is_idm
+    vpbroadcastw    m13, [base+min+r7*2]
+    shlx            r7d, r7d, r9d
+    vpbroadcastw    m12, [base+max+r7*2]
+
+    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+    jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+%if %1
+    mov             r7d, dword r11m
+    vpbroadcastb     m0, [fg_dataq+FGData.uv_mult+r7*4]
+    vpbroadcastb     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
+    punpcklbw       m14, m1, m0
+    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
+%else
+    vpbroadcastd    m14, [pw_1024]
+%if %2
+    vpbroadcastd    m15, [pb_23_22]
+%else
+    vpbroadcastd   xm15, [pb_27_17_17_27]
+%endif
+%endif
+
+    mov        overlapd, [fg_dataq+FGData.overlap_flag]
+    movifnidn      sbyd, sbym
+    test           sbyd, sbyd
+    setnz           r7b
+    test            r7b, overlapb
+    jnz %%vertical_overlap
+
+    imul           seed, sbyd, (173 << 24) | 37
+    add            seed, (105 << 24) | 178
+    rol            seed, 8
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
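+    ; equivalent to the per-row seed from the spec:
+    ;     seed = data.seed ^ (((sby*37 + 178) & 0xff) << 8)
+    ;                      ^ ((sby*173 + 105) & 0xff)
+    ; the rol/movzx pair moves the byte computed in bits 24-31 down into
+    ; the low half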
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                unused2, unused3, see, overlap, unused4, unused5, lstride
+
+    mov           lumaq, r9mp
+    lea             r12, [srcq+wq]
+    lea             r13, [dstq+wq]
+    lea             r14, [lumaq+wq*(1+%2)]
+    mov           r11mp, r12
+    mov           r12mp, r13
+    mov        lstrideq, r10mp
+    neg              wq
+
+%%loop_x:
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d               ; updated seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, overlap, unused1, unused2, lstride
+
+    mov           offxd, seed
+    rorx          offyd, seed, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, overlap, unused1, unused2, lstride
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+%%loop_y:
+    ; src
+%if %2
+    mova            xm4, [lumaq+lstrideq*0+ 0]
+    mova            xm6, [lumaq+lstrideq*0+16]
+    mova            xm0, [srcq]
+    vpbroadcastd     m7, [pb_1]
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
+    vinserti128      m0, [srcq+strideq], 1
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    pxor             m2, m2
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%endif
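+    ; subsampled layouts reduce luma by averaging horizontal pairs only:
+    ; pmaddubsw against pb_1 sums each pair and pavgw with zero halves it
+    ; with rounding, i.e. luma = (l0 + l1 + 1) >> 1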
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
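+    ; i.e. for the explicit-scaling path the gather index becomes
+    ;     clip(((uv_luma_mult*luma + uv_mult*chroma) >> 6) + uv_offset, 0, 255)
+    ; with the pack/unpack pair providing the unsigned byte clip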
+
+    punpckhwd        m5, m4, m2
+    punpcklwd        m4, m2
+    punpckhwd        m7, m6, m2
+    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
+
+    ; scaling[luma_src]
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+    ; unpack chroma_source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; grain = grain_lut[offy+y][offx+x]
+%if %2
+    movu            xm3, [grain_lutq+offxyq+ 0]
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+%endif
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+%if %2
+    mova         [dstq], xm0
+    vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
+
+%if %2
+    lea            srcq, [srcq+strideq*2]
+    lea            dstq, [dstq+strideq*2]
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
+    jg %%loop_y
+
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r14+wq*(1+%2)]
+    add            srcq, wq
+    add            dstq, wq
+    test       overlapd, overlapd
+    jz %%loop_x
+
+    ; r8m = sbym
+    cmp       dword r8m, 0
+    jne %%loop_x_hv_overlap
+
+    ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d               ; updated seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, unused1, unused2, lstride
+
+    lea     left_offxyd, [offyd+(32>>%2)]         ; previous column's offy*stride+offx
+    mov           offxd, seed
+    rorx          offyd, seed, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, unused1, unused2, lstride
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+    ; src
+%if %2
+    mova            xm4, [lumaq+lstrideq*0+ 0]
+    mova            xm6, [lumaq+lstrideq*0+16]
+    mova            xm0, [srcq]
+    vpbroadcastd     m7, [pb_1]
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
+    vinserti128      m0, [srcq+strideq], 1
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    punpckhwd        m5, m4, m2
+    punpcklwd        m4, m2
+    punpckhwd        m7, m6, m2
+    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
+
+    ; scaling[luma_src]
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+    ; unpack chroma_source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; grain = grain_lut[offy+y][offx+x]
+%if %2
+%if %1
+    vpbroadcastd     m6, [pb_23_22] ; FIXME
+%endif
+    movu            xm3, [grain_lutq+offxyq+ 0]
+    movd            xm4, [grain_lutq+left_offxyq+ 0]
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
+    punpcklbw        m4, m3
+%if %1
+    pmaddubsw        m4, m6, m4
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmaddubsw        m4, m15, m4
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m4, m4
+    pcmpeqw          m6, m6 ; FIXME
+    psrldq           m6, 15 ; FIXME
+    vpblendvb        m3, m3, m4, m6
+%else
+%if %1
+    vpbroadcastd    xm6, [pb_27_17_17_27]
+%endif
+    movu             m3, [grain_lutq+offxyq]
+    movd            xm4, [grain_lutq+left_offxyq]
+    punpcklbw       xm4, xm3
+%if %1
+    pmaddubsw       xm4, xm6, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4
+    pcmpeqw         xm6, xm6
+    psrldq          xm6, 14
+    vpblendvb        m3, m3, m4, m6
+%endif
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+%if %2
+    mova         [dstq], xm0
+    vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
+
+%if %2
+    lea            srcq, [srcq+strideq*2]
+    lea            dstq, [dstq+strideq*2]
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82*(1+%2)
+    sub              hb, 1+%2
+    jg %%loop_y_h_overlap
+
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r14+wq*(1+%2)]
+    add            srcq, wq
+    add            dstq, wq
+
+    ; r8m = sbym
+    cmp       dword r8m, 0
+    jne %%loop_x_hv_overlap
+    jmp %%loop_x_h_overlap
+
+%%end:
+    RET
+
+%%vertical_overlap:
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+                sby, see, overlap, unused1, unused2, lstride
+
+    movzx          sbyd, sbyb
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+    imul            r7d, sbyd, 173 * 0x00010001
+    imul           sbyd, 37 * 0x01000100
+    add             r7d, (105 << 16) | 188
+    add            sbyd, (178 << 24) | (141 << 8)
+    and             r7d, 0x00ff00ff
+    and            sbyd, 0xff00ff00
+    xor            seed, r7d
+    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                unused1, unused2, see, overlap, unused3, unused4, lstride
+
+    mov           lumaq, r9mp
+    lea             r12, [srcq+wq]
+    lea             r13, [dstq+wq]
+    lea             r14, [lumaq+wq*(1+%2)]
+    mov           r11mp, r12
+    mov           r12mp, r13
+    mov        lstrideq, r10mp
+    neg              wq
+
+%%loop_x_v_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b                     ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, overlap, top_offxy, unused, lstride
+
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, overlap, top_offxy, unused, lstride
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
+%%loop_y_v_overlap:
+    ; src
+%if %2
+    mova            xm4, [lumaq+lstrideq*0+ 0]
+    mova            xm6, [lumaq+lstrideq*0+16]
+    mova            xm0, [srcq]
+    vpbroadcastd     m7, [pb_1]
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
+    vinserti128      m0, [srcq+strideq], 1
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    punpckhwd        m5, m4, m2
+    punpcklwd        m4, m2
+    punpckhwd        m7, m6, m2
+    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
+
+    ; scaling[luma_src]
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m8, [scalingq+m4], m3
+    vpgatherdd       m4, [scalingq+m5], m9
+    pcmpeqw          m3, m3
+    pcmpeqw          m9, m9
+    vpgatherdd       m5, [scalingq+m6], m3
+    vpgatherdd       m6, [scalingq+m7], m9
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+%if %2
+    ; unpack chroma_source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
+
+    ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+    mova             m6, [pb_8x_27_17_8x_17_27]
+    movu            xm3, [grain_lutq+offxyq]
+    movu            xm4, [grain_lutq+top_offxyq]
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m4, [grain_lutq+top_offxyq]
+%endif
+    punpckhbw        m9, m4, m3
+    punpcklbw        m4, m3
+%if %2
+    pmaddubsw        m9, m6, m9
+    pmaddubsw        m4, m6, m4
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m4, m1, m4
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m3, m4, m9
+%else
+%if %1
+    vpbroadcastd     m6, [pb_23_22]
+%endif
+    movq            xm3, [grain_lutq+offxyq]
+    movq            xm4, [grain_lutq+top_offxyq]
+    vinserti128      m3, [grain_lutq+offxyq+8], 1
+    vinserti128      m4, [grain_lutq+top_offxyq+8], 1
+    punpcklbw        m4, m3
+%if %1
+    pmaddubsw        m4, m6, m4
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmaddubsw        m4, m15, m4
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m4, m4
+    vpermq           m4, m4, q3120
+    ; only interpolate first line, insert second line unmodified
+    vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
+%endif
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+%if %2
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova         [dstq], xm0
+    vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0-1: src as word
+
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
+
+    sub              hb, 1+%2
+    jl %%end_y_v_overlap
+%if %2
+    lea            srcq, [srcq+strideq*2]
+    lea            dstq, [dstq+strideq*2]
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_v_overlap
+%endif
+    jmp %%loop_y
+
+%%end_y_v_overlap:
+    add              wq, 32>>%2
+    jge %%end_hv
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r14+wq*(1+%2)]
+    add            srcq, wq
+    add            dstq, wq
+
+    ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to %%loop_x_v_overlap, and instead always fall through to
+    ; h+v overlap
+
+%%loop_x_hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b                     ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
+    lea     left_offxyq, [offyq+(32>>%2)]
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+    mov              hd, hm
+    mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
+%%loop_y_hv_overlap:
+    ; src
+%if %2
+    mova            xm4, [lumaq+lstrideq*0+ 0]
+    mova            xm6, [lumaq+lstrideq*0+16]
+    mova            xm0, [srcq]
+    vpbroadcastd     m7, [pb_1]
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
+    vinserti128      m0, [srcq+strideq], 1
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    punpckhwd        m5, m4, m2
+    punpcklwd        m4, m2
+    punpckhwd        m7, m6, m2
+    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
+
+    ; scaling[luma_src]
+    pcmpeqw          m9, m9
+    pcmpeqw          m3, m3
+    vpgatherdd       m8, [scalingq+m4], m9
+    vpgatherdd       m4, [scalingq+m5], m3
+    pcmpeqw          m9, m9
+    pcmpeqw          m3, m3
+    vpgatherdd       m5, [scalingq+m6], m9
+    vpgatherdd       m6, [scalingq+m7], m3
+    pand             m8, m10
+    pand             m4, m10
+    pand             m5, m10
+    pand             m6, m10
+    packusdw         m8, m4
+    packusdw         m5, m6
+
+%if %2
+    ; unpack chroma source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
+
+    ; grain = grain_lut[offy+y][offx+x]
+%if %1
+%if %2
+    vpbroadcastd     m9, [pb_23_22]
+%else
+    vpbroadcastd    xm9, [pb_27_17_17_27]
+%endif
+%endif
+
+%if %2
+    movu            xm3, [grain_lutq+offxyq]
+%if %3
+    movq            xm6, [grain_lutq+top_offxyq]
+%else
+    movu            xm6, [grain_lutq+top_offxyq]
+%endif
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+%if %3
+    vinserti128      m6, [grain_lutq+top_offxyq+8], 1
+%else
+    vinserti128      m6, [grain_lutq+top_offxyq+82], 1
+%endif
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m6, [grain_lutq+top_offxyq]
+%endif
+    movd            xm4, [grain_lutq+left_offxyq]
+    movd            xm7, [grain_lutq+topleft_offxyq]
+%if %2
+    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
+%if %3 == 0
+    vinserti128      m7, [grain_lutq+topleft_offxyq+82], 1
+%endif
+%endif
+
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+%if %2
+    punpcklbw        m4, m3
+%if %3
+    punpcklbw       xm7, xm6
+%else
+    punpcklbw        m7, m6
+%endif
+    punpcklwd        m4, m7
+%if %1
+    pmaddubsw        m4, m9, m4
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmaddubsw        m4, m15, m4
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m4, m4
+    pcmpeqw          m9, m9                 ; this is kind of ugly
+    psrldq           m9, 15
+    vpblendvb        m3, m3, m4, m9
+    psrldq           m4, 1
+%if %3
+    shufpd           m9, m9, m9, 1110b      ; clear upper lane
+%endif
+    vpblendvb        m6, m6, m4, m9
+%else
+    punpcklbw       xm4, xm3
+    punpcklbw       xm7, xm6
+    punpckldq       xm4, xm7
+%if %1
+    pmaddubsw       xm4, xm9, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4
+    pcmpeqw         xm9, xm9                 ; this is kind of ugly
+    psrldq          xm9, 14
+    vpblendvb        m3, m3, m4, m9
+    psrldq          xm4, 2
+    vpblendvb        m6, m6, m4, m9
+%endif
+
+    ; followed by v interpolation (top | cur -> cur)
+%if %3
+    vpermq           m9, m3, q3120
+    punpcklbw        m6, m9
+%if %1
+    vpbroadcastd     m9, [pb_23_22]
+    pmaddubsw        m6, m9, m6
+    pmulhrsw         m6, [pw_1024]
+%else
+    pmaddubsw        m6, m15, m6
+    pmulhrsw         m6, m14
+%endif
+    packsswb         m6, m6
+    vpermq           m6, m6, q3120
+    vpblendd         m3, m3, m6, 00001111b
+%else
+    punpckhbw        m9, m6, m3
+    punpcklbw        m6, m3
+%if %2
+    mova             m3, [pb_8x_27_17_8x_17_27]
+    pmaddubsw        m9, m3, m9
+    pmaddubsw        m6, m3, m6
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m6, m1, m6
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m6, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m6, m14
+%endif
+    packsswb         m3, m6, m9
+%endif
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m8
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; dst = clip_pixel(src, noise)
+%if %2
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    mova         [dstq], xm0
+    vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0-1: src as word
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
+
+%if %2
+    lea            srcq, [srcq+strideq*2]
+    lea            dstq, [dstq+strideq*2]
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
+%if %2
+    jg %%loop_y_h_overlap
+%else
+    je %%end_y_hv_overlap
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_hv_overlap
+    jmp %%loop_y_h_overlap
+%endif
+
+%%end_y_hv_overlap:
+    add              wq, 32>>%2
+    jge %%end_hv
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r14+wq*(1+%2)]
+    add            srcq, wq
+    add            dstq, wq
+    jmp %%loop_x_hv_overlap
+
+%%end_hv:
+    RET
+%endmacro
+
+    %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/src/x86/film_grain_init_tmpl.c b/src/x86/film_grain_init_tmpl.c
new file mode 100644
index 0000000..25e8ef9
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/film_grain.h"
+
+decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
+decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
+
+decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
+decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
+
+COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
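+    /* Function pointers are assigned in increasing ISA order, each tier
+     * overwriting the previous one, so after the early returns the
+     * fastest supported implementation is the one left installed. */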
+#if BITDEPTH == 8
+    c->generate_grain_y = dav1d_generate_grain_y_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
+    c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->generate_grain_y = dav1d_generate_grain_y_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
+    c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
+#endif
+}
diff --git a/src/x86/film_grain_ssse3.asm b/src/x86/film_grain_ssse3.asm
new file mode 100644
index 0000000..8212846
--- /dev/null
@@ -0,0 +1,3300 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+pw_1024: times 8 dw 1024
+pb_27_17: times 8 db 27, 17
+pb_17_27: times 8 db 17, 27
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+             times 2 dw 0x49d8
+pb_23_22: times 2 db 23, 22
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pw_1: dw 1
+
+%define pb_27_17_17_27 pb_17_27 - 2
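+; pb_27_17_17_27 needs no storage of its own: pb_17_27 - 2 points at the
+; last two bytes of pb_27_17 followed by the first two of pb_17_27,
+; yielding the 27, 17, 17, 27 pattern used for 2-pixel overlap blends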
+
+%macro JMP_TABLE 1-*
+    %xdefine %1_table %%table
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    %%table:
+    %rep %0 - 1
+        dd %%prefix %+ .ar%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
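+; each table entry is the 32-bit offset of the corresponding .ar0-.ar3
+; label relative to the table base; the grain generators index it with
+; FGData.ar_coeff_lag to dispatch to the matching auto-regression filter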
+
+JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
+
+struc FGData
+    .seed:                      resd 1
+    .num_y_points:              resd 1
+    .y_points:                  resb 14 * 2
+    .chroma_scaling_from_luma:  resd 1
+    .num_uv_points:             resd 2
+    .uv_points:                 resb 2 * 10 * 2
+    .scaling_shift:             resd 1
+    .ar_coeff_lag:              resd 1
+    .ar_coeffs_y:               resb 24
+    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
+    .ar_coeff_shift:            resq 1
+    .grain_scale_shift:         resd 1
+    .uv_mult:                   resd 2
+    .uv_luma_mult:              resd 2
+    .uv_offset:                 resd 2
+    .overlap_flag:              resd 1
+    .clip_to_restricted_range:  resd 1
+endstruc
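+; this mirrors the layout of Dav1dFilmGrainData (see
+; include/dav1d/headers.h); the asm addresses the C struct directly, so
+; these offsets must stay in sync with it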
+
+cextern gaussian_sequence
+
+SECTION .text
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+    mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+    SWAP             %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
+    LEA              r4, $$
+%define base r4-$$
+    movq             m1, [base+rnd_next_upperbit_mask]
+    movq             m4, [base+mul_bits]
+    movq             m7, [base+hmul_bits]
+    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
+    movd             m2, [base+round+r2*2]
+    movd             m0, [fg_dataq+FGData.seed]
+    mova             m5, [base+pb_mask]
+    pshuflw          m2, m2, q0000
+    pshuflw          m0, m0, q0000
+    mov              r2, -73*82
+    sub            bufq, r2
+    lea              r3, [base+gaussian_sequence]
+.loop:
+    pand             m6, m0, m1
+    psrlw            m3, m6, 10
+    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw           m6, m4            ; bits 0x0f00 are set
+    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
+    psllq            m6, m3, 30
+    por              m3, m6
+    psllq            m6, m3, 15
+    por              m3, m6            ; aggregate each bit into next seed's high bit
+    pmulhuw          m6, m0, m7
+    por              m3, m6            ; 4 next output seeds
+    pshuflw          m0, m3, q3333
+    psrlw            m3, 5
+%if ARCH_X86_64
+    movq             r6, m3
+    mov              r8, r6
+    movzx           r5d, r6w
+    shr             r6d, 16
+    shr              r8, 32
+    movzx            r7, r8w
+    shr              r8, 16
+
+    movd             m6, [r3+r5*2]
+    pinsrw           m6, [r3+r6*2], 1
+    pinsrw           m6, [r3+r7*2], 2
+    pinsrw           m6, [r3+r8*2], 3
+%else
+    movd             r6, m3
+    pshuflw          m3, m3, q3232
+    movzx            r5, r6w
+    shr              r6, 16
+
+    movd             m6, [r3+r5*2]
+    pinsrw           m6, [r3+r6*2], 1
+
+    movd             r6, m3
+    movzx            r5, r6w
+    shr              r6, 16
+
+    pinsrw           m6, [r3+r5*2], 2
+    pinsrw           m6, [r3+r6*2], 3
+%endif
+    pmulhrsw         m6, m2
+    packsswb         m6, m6
+    movd      [bufq+r2], m6
+    add              r2, 4
+    jl .loop
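+    ; the loop above steps four 16-bit seeds in parallel and stores four
+    ; grain bytes per iteration: gaussian_sequence[seed >> 5] scaled by
+    ; the grain_scale_shift rounding constant in m2, until all 82x73
+    ; bytes of the luma grain buffer are filled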
+
+    ; auto-regression code
+    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r2, [base+generate_grain_y_ssse3_table+r2*4]
+    lea              r2, [r2+base+generate_grain_y_ssse3_table]
+    jmp              r2
+
+.ar1:
+%if ARCH_X86_32
+    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
+%elif WIN64
+    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
+    mov            bufq, r0
+%else
+    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+%endif
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
+    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
+%if ARCH_X86_32
+    mov             r1m, cf3d
+    DEFINE_ARGS buf, shift, val3, min, max, x, val0
+%define hd r0mp
+%define cf3d r1mp
+%elif WIN64
+    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
+%else
+    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+%endif
+    pxor             m6, m6
+    pcmpgtb          m7, m6, m4
+    punpcklbw        m4, m7
+    pinsrw           m4, [base+pw_1], 3
+    pshufd           m5, m4, q1111
+    pshufd           m4, m4, q0000
+    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
+    pshuflw          m3, m3, q0000
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+    mov            mind, -128
+    mov            maxd, 127
+.y_loop_ar1:
+    mov              xq, -76
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+    movq             m0, [bufq+xq-82-1]     ; top/left
+    pcmpgtb          m7, m6, m0
+    punpcklbw        m0, m7
+    psrldq           m2, m0, 2              ; top
+    psrldq           m1, m0, 4              ; top/right
+    punpcklwd        m0, m2
+    punpcklwd        m1, m3
+    pmaddwd          m0, m4
+    pmaddwd          m1, m5
+    paddd            m0, m1
+.x_loop_ar1_inner:
+    movd          val0d, m0
+    psrldq           m0, 4
+    imul          val3d, cf3d
+    add           val3d, val0d
+    sar           val3d, shiftb
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovns        val3d, maxd
+    cmp           val3d, mind
+    cmovs         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar1
+.ar0:
+    RET
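+    ; scalar sketch of the .ar1 filter above (cf = FGData.ar_coeffs_y,
+    ; grain values are clamped to the signed 8-bit range):
+    ;     g = cf[0]*buf[y-1][x-1] + cf[1]*buf[y-1][x] + cf[2]*buf[y-1][x+1]
+    ;       + cf[3]*buf[y][x-1] + (1 << (ar_coeff_shift - 1));
+    ;     buf[y][x] = clamp(buf[y][x] + (g >> ar_coeff_shift), -128, 127);
+    ; .ar2/.ar3 below extend the same recursion to wider neighbourhoods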
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+    ALLOC_STACK -16*8
+%endif
+    DEFINE_ARGS buf, fg_data, shift
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd             m6, [base+round_vals-12+shiftq*2]
+    movd             m7, [base+byte_blend+1]
+    SCRATCH           7, 15, 7
+    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
+    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
+    pxor             m7, m7
+    pshuflw          m6, m6, q0000
+    punpcklwd        m6, m7
+    pcmpgtb          m4, m7, m0
+    pcmpgtb          m5, m7, m1
+    punpcklbw        m0, m4
+    punpcklbw        m1, m5
+    DEFINE_ARGS buf, fg_data, h, x
+    pshufd           m4, m1, q0000
+    pshufd           m5, m1, q1111
+    pshufd           m3, m0, q3333
+    pshufd           m2, m0, q2222
+    pshufd           m1, m0, q1111
+    pshufd           m0, m0, q0000
+    SCRATCH           0, 8,  0
+    SCRATCH           1, 9,  1
+    SCRATCH           2, 10, 2
+    SCRATCH           3, 11, 3
+    SCRATCH           4, 12, 4
+    SCRATCH           5, 13, 5
+    SCRATCH           6, 14, 6
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+.y_loop_ar2:
+    mov              xq, -76
+
+.x_loop_ar2:
+    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    pcmpgtb          m2, m7, m0
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2
+    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
+    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
+    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
+    punpcklwd        m2, m0, m5
+    punpcklwd        m3, m4
+    pmaddwd          m2, m8
+    pmaddwd          m3, m11
+    paddd            m2, m3
+
+    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
+    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
+    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
+    punpcklwd        m4, m5
+    punpcklwd        m6, m1
+    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
+    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
+    punpcklwd        m5, m1
+    pmaddwd          m4, m9
+    pmaddwd          m6, m10
+    pmaddwd          m5, m12
+    paddd            m4, m6
+    paddd            m2, m5
+    paddd            m2, m4
+    paddd            m2, m14
+
+    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+    pcmpgtb          m4, m7, m0
+    punpcklbw        m1, m0, m4
+    pmaddwd          m3, m1, m13
+    paddd            m3, m2
+    psrldq           m1, 4                  ; y=0,x=0
+    psrldq           m2, 4                  ; shift top to next pixel
+    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw since we only care about one value
+    paddw            m3, m1
+    packsswb         m3, m3
+    pslldq           m3, 2
+    pand             m3, m15
+    pandn            m1, m15, m0
+    por              m0, m1, m3
+    psrldq           m0, 1
+    ; overwrite 2 pixels, but that's ok
+    movd      [bufq+xq-1], m0
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+    DEFINE_ARGS buf, fg_data, shift
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+    ALLOC_STACK  -16*14
+%elif WIN64
+    SUB             rsp, 16*6
+%assign stack_size_padded (stack_size_padded+16*6)
+%assign stack_size (stack_size+16*6)
+%else
+    ALLOC_STACK  -16*6
+%endif
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd             m6, [base+round_vals-12+shiftq*2]
+    movd             m7, [base+byte_blend]
+    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
+    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
+    pxor             m3, m3
+    pcmpgtb          m4, m3, m0
+    pcmpgtb          m3, m2
+    pshuflw          m6, m6, q0000
+    SCRATCH           6, 14, 12
+    SCRATCH           7, 15, 13
+    punpckhbw        m1, m0, m4
+    punpcklbw        m0, m4
+    punpcklbw        m2, m3
+    pshufd           m3, m0, q1111
+    pshufd           m4, m0, q2222
+    pshufd           m5, m0, q3333
+    pshufd           m0, m0, q0000
+    mova    [rsp+ 0*16], m0
+    mova    [rsp+ 1*16], m3
+    mova    [rsp+ 2*16], m4
+    mova    [rsp+ 3*16], m5
+    pshufd           m6, m1, q1111
+    pshufd           m7, m1, q2222
+    pshufd           m5, m1, q3333
+    pshufd           m1, m1, q0000
+    pshufd           m3, m2, q1111
+    psrldq           m0, m2, 10
+    pinsrw           m2, [base+pw_1], 5
+    pshufd           m4, m2, q2222
+    pshufd           m2, m2, q0000
+    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
+    mova    [rsp+ 4*16], m1
+    mova    [rsp+ 5*16], m6
+    SCRATCH           7, 8,  6
+    SCRATCH           5, 9,  7
+    SCRATCH           2, 10, 8
+    SCRATCH           3, 11, 9
+    SCRATCH           4, 12, 10
+    SCRATCH           0, 13, 11
+    DEFINE_ARGS buf, fg_data, h, x
+    sub            bufq, 82*73-(82*3+79)
+    mov              hd, 70
+.y_loop_ar3:
+    mov              xq, -76
+
+.x_loop_ar3:
+    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    pxor             m3, m3
+    pcmpgtb          m3, m0
+    punpckhbw        m2, m0, m3
+    punpcklbw        m0, m3
+
+    psrldq           m5, m0, 2
+    psrldq           m6, m0, 4
+    psrldq           m7, m0, 6
+    punpcklwd        m4, m0, m5
+    punpcklwd        m6, m7
+    pmaddwd          m4, [rsp+ 0*16]
+    pmaddwd          m6, [rsp+ 1*16]
+    paddd            m4, m6
+
+    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    pxor             m5, m5
+    pcmpgtb          m5, m1
+    punpckhbw        m3, m1, m5
+    punpcklbw        m1, m5
+    palignr          m6, m2, m0, 10
+    palignr          m7, m2, m0, 12
+    psrldq           m0, 8
+    punpcklwd        m0, m6
+    punpcklwd        m7, m1
+    pmaddwd          m0, [rsp+ 2*16]
+    pmaddwd          m7, [rsp+ 3*16]
+    paddd            m0, m7
+    paddd            m0, m4
+
+    psrldq           m4, m1, 2
+    psrldq           m5, m1, 4
+    psrldq           m6, m1, 6
+    psrldq           m7, m1, 8
+    punpcklwd        m4, m5
+    punpcklwd        m6, m7
+    pmaddwd          m4, [rsp+ 4*16]
+    pmaddwd          m6, [rsp+ 5*16]
+    paddd            m4, m6
+    paddd            m0, m4
+
+    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor             m7, m7
+    pcmpgtb          m7, m2
+    punpckhbw        m5, m2, m7
+    punpcklbw        m2, m7
+    palignr          m7, m3, m1, 10
+    palignr          m3, m1, 12
+    psrldq           m1, m2, 2
+    punpcklwd        m7, m3
+    punpcklwd        m3, m2, m1
+    pmaddwd          m7, m8
+    pmaddwd          m3, m9
+    paddd            m7, m3
+    paddd            m0, m7
+
+    psrldq           m6, m2, 4
+    psrldq           m1, m2, 6
+    psrldq           m3, m2, 8
+    palignr          m4, m5, m2, 10
+    palignr          m5, m5, m2, 12
+
+    punpcklwd        m6, m1
+    punpcklwd        m3, m4
+    punpcklwd        m5, m14
+    pmaddwd          m6, m10
+    pmaddwd          m3, m11
+    pmaddwd          m5, m12
+    paddd            m0, m6
+    paddd            m3, m5
+    paddd            m0, m3
+
+    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+    pxor             m5, m5
+    pcmpgtb          m5, m1
+    punpcklbw        m2, m1, m5
+    pmaddwd          m2, m13
+    pshufd           m3, m2, q1111
+    paddd            m2, m3                 ; left+cur
+    paddd            m2, m0                 ; add top
+    psrldq           m0, 4
+    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw since we only care about one value
+    packsswb         m2, m2
+    pslldq           m2, 3
+    pand             m2, m15
+    pandn            m3, m15, m1
+    por              m1, m2, m3
+    movd    [bufq+xq-3], m1
+    psrldq           m1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    dec              hd
+    jg .y_loop_ar3
+    RET
+
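+; Chroma grain generation. ss_x/ss_y encode the chroma subsampling, so the
+; grain template shrinks to 44 columns (and, for 4:2:0, 38 rows), and the
+; plane seed is the frame seed XORed with a per-plane constant from
+; pw_seed_xor. The AR pass can also mix in the collocated luma grain,
+; weighted by the last ar_coeffs_uv entry (cf24, see below).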
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+    movifnidn        r2, r2mp
+    movifnidn        r3, r3mp
+    LEA              r4, $$
+%define base r4-$$
+    movq             m1, [base+rnd_next_upperbit_mask]
+    movq             m4, [base+mul_bits]
+    movq             m7, [base+hmul_bits]
+    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
+    movd             m6, [base+round+r5*2]
+    mova             m5, [base+pb_mask]
+    movd             m0, [fg_dataq+FGData.seed]
+    movd             m2, [base+pw_seed_xor+uvq*4]
+    pxor             m0, m2
+    pshuflw          m6, m6, q0000
+    pshuflw          m0, m0, q0000
+    lea              r6, [base+gaussian_sequence]
+%if %2
+%if ARCH_X86_64
+    mov             r7d, 73-35*%3
+%else
+    mov            r3mp, 73-35*%3
+%endif
+    add            bufq, 44
+.loop_y:
+    mov              r5, -44
+.loop_x:
+%else
+    mov              r5, -82*73
+    sub            bufq, r5
+.loop:
+%endif
+    pand             m2, m0, m1
+    psrlw            m3, m2, 10
+    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw           m2, m4             ; bits 0x0f00 are set
+    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
+    psllq            m2, m3, 30
+    por              m3, m2
+    psllq            m2, m3, 15
+    por              m3, m2             ; aggregate each bit into next seed's high bit
+    pmulhuw          m2, m0, m7
+    por              m2, m3             ; 4 next output seeds
+    pshuflw          m0, m2, q3333
+    psrlw            m2, 5
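+    ; the block above runs 4 copies of the spec's 16-bit grain LFSR in
+    ; parallel (taps 0, 1, 3, 12; each word of rnd_next_upperbit_mask is
+    ; that tap set pre-shifted for one of the 4 successive states):
+    ;   bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1
+    ;   s   = (s >> 1) | (bit << 15)
+    ; psrlw by 5 keeps the top 11 bits of each seed, i.e. the index into
+    ; the 2048-entry gaussian_sequence table read below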
+%if ARCH_X86_64
+    movd            r9d, m2
+    pshuflw          m2, m2, q3232
+    movzx            r8, r9w
+    shr              r9, 16
+
+    movd             m3, [r6+r8*2]
+    pinsrw           m3, [r6+r9*2], 1
+
+    movd            r9d, m2
+    movzx            r8, r9w
+    shr              r9, 16
+
+    pinsrw           m3, [r6+r8*2], 2
+    pinsrw           m3, [r6+r9*2], 3
+%else
+    movd             r2, m2
+    pshuflw          m2, m2, q3232
+    movzx            r1, r2w
+    shr              r2, 16
+
+    movd             m3, [r6+r1*2]
+    pinsrw           m3, [r6+r2*2], 1
+
+    movd             r2, m2
+    movzx            r1, r2w
+    shr              r2, 16
+
+    pinsrw           m3, [r6+r1*2], 2
+    pinsrw           m3, [r6+r2*2], 3
+%endif
+    pmulhrsw         m3, m6
+    packsswb         m3, m3
+    movd      [bufq+r5], m3
+    add              r5, 4
+%if %2
+    jl .loop_x
+    add            bufq, 82
+%if ARCH_X86_64
+    dec             r7d
+%else
+    dec            r3mp
+%endif
+    jg .loop_y
+%else
+    jl .loop
+%endif
+
+%if ARCH_X86_32
+    mov              r2, r2mp
+%endif
+
+    ; auto-regression code
+    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_ssse3_table]
+    jmp              r5
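+    ; tail-dispatch on ar_coeff_lag (0..3): .ar0/.ar1/.ar2/.ar3 below each
+    ; implement the chroma auto-regressive filter for one lag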
+
+.ar0:
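+    ; lag 0: chroma grain is just the (horizontally averaged, if subsampled)
+    ; luma grain scaled by the single ar_coeffs_uv coefficient and rounded,
+    ; added to the chroma white noise generated above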
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    movifnidn     bufyq, bufymp
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+    ALLOC_STACK   -2*16
+%endif
+    imul            uvd, 28
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    movd             m4, [base+hmul_bits+shiftq*2]
+    DEFINE_ARGS buf, bufy, h, x
+    pxor             m0, m0
+    pcmpgtb          m0, m5
+    punpcklbw        m5, m0
+    movd             m7, [base+pb_1]
+%if %2
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
+    pshuflw          m5, m5, q0000
+    pshuflw          m4, m4, q0000
+    pshufd           m7, m7, q0000
+%if %2
+    pshuflw          m6, m6, q0000
+%endif
+    punpcklqdq       m5, m5
+    punpcklqdq       m4, m4
+%if %2
+    punpcklqdq       m6, m6
+%endif
+    pcmpeqw          m1, m1
+    pslldq           m1, 12>>%2
+    SCRATCH           1, 8, 0
+    SCRATCH           4, 9, 1
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
+    add           bufyq, 3+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar0:
+    xor              xd, xd
+.x_loop_ar0:
+    ; first 32 pixels
+%if %2
+    movu             m1, [bufyq+xq*2]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
+    movu             m3, [bufyq+xq*2+16]
+%if %3
+    movu             m4, [bufyq+xq*2+82+16]
+%endif
+    pmaddubsw        m0, m7, m1
+%if %3
+    pmaddubsw        m1, m7, m2
+%endif
+    pmaddubsw        m2, m7, m3
+%if %3
+    pmaddubsw        m3, m7, m4
+    paddw            m0, m1
+    paddw            m2, m3
+%endif
+    pmulhrsw         m0, m6
+    pmulhrsw         m2, m6
+%else
+    movu             m0, [bufyq+xq]
+    pxor             m6, m6
+    pcmpgtb          m6, m0
+    punpckhbw        m2, m0, m6
+    punpcklbw        m0, m6
+%endif
+    pmullw           m0, m5
+    pmullw           m2, m5
+    pmulhrsw         m0, m9
+    pmulhrsw         m2, m9
+    movu             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpckhbw        m3, m1, m4
+%if %2
+    punpcklbw        m1, m4
+    paddw            m2, m3
+    paddw            m0, m1
+%else
+    punpcklbw        m6, m1, m4
+    paddw            m2, m3
+    paddw            m0, m6
+%endif
+    packsswb         m0, m2
+%if %2
+    movu      [bufq+xq], m0
+    add              xd, 16
+    cmp              xd, 32
+    jl .x_loop_ar0
+
+    ; last 6/12 pixels
+    movu             m1, [bufyq+xq*(1+%2)]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
+    pmaddubsw        m0, m7, m1
+%if %3
+    pmaddubsw        m1, m7, m2
+    paddw            m0, m1
+%endif
+    pmulhrsw         m0, m6
+    pmullw           m0, m5
+    pmulhrsw         m0, m9
+    movq             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpcklbw        m2, m1, m4
+    paddw            m0, m2
+    packsswb         m0, m0
+    pandn            m2, m8, m0
+    pand             m1, m8
+    por              m2, m1
+    movq      [bufq+xq], m2
+%else
+    add              xd, 16
+    cmp              xd, 80
+    je .y_loop_final_ar0
+    movu   [bufq+xq-16], m0
+    jmp .x_loop_ar0
+.y_loop_final_ar0:
+    pandn            m2, m8, m0
+    pand             m1, m8
+    por              m2, m1
+    movu   [bufq+xq-16], m2
+%endif
+
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar0
+    RET
+
+.ar1:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
+    imul            uvd, 28
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
+    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
+%if ARCH_X86_32
+    mov            r3mp, cf3d
+    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
+%elif WIN64
+    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
+    mov            bufq, r0
+%else
+    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
+%endif
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
+    movd             m7, [base+pb_1]
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
+    psrldq           m4, 1
+%if ARCH_X86_32
+    DEFINE_ARGS buf, shift, val0, val3, min, max, x
+%elif WIN64
+    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
+%else
+    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
+%endif
+    pxor             m5, m5
+    punpcklwd        m3, m5
+%if %2
+    punpcklwd        m6, m6
+%endif
+    pcmpgtb          m5, m4
+    punpcklbw        m4, m5
+    pshufd           m5, m4, q1111
+    pshufd           m4, m4, q0000
+    pshufd           m3, m3, q0000
+%if %2
+    pshufd           m7, m7, q0000
+    pshufd           m6, m6, q0000
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
+%if ARCH_X86_32
+    add            r1mp, 79+82*3
+    mov            r0mp, 70-35*%3
+%else
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+%endif
+    mov            mind, -128
+    mov            maxd, 127
+.y_loop_ar1:
+    mov              xq, -(76>>%2)
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+%if %2
+%if ARCH_X86_32
+    mov              r2, r1mp
+    movq             m0, [r2+xq*2]
+%if %3
+    movq             m1, [r2+xq*2+82]
+%endif
+%else
+    movq             m0, [bufyq+xq*2]
+%if %3
+    movq             m1, [bufyq+xq*2+82]
+%endif
+%endif
+    pmaddubsw        m2, m7, m0
+%if %3
+    pmaddubsw        m0, m7, m1
+    paddw            m2, m0
+%endif
+    pmulhrsw         m2, m6
+%else
+%if ARCH_X86_32
+    mov              r2, r1mp
+    movd             m2, [r2+xq]
+%else
+    movd             m2, [bufyq+xq]
+%endif
+    pxor             m0, m0
+    pcmpgtb          m0, m2
+    punpcklbw        m2, m0
+%endif
+
+    movq             m0, [bufq+xq-82-1]     ; top/left
+    pxor             m1, m1
+    pcmpgtb          m1, m0
+    punpcklbw        m0, m1
+    psrldq           m1, m0, 4              ; top/right
+    punpcklwd        m1, m2
+    psrldq           m2, m0, 2              ; top
+    punpcklwd        m0, m2
+    pmaddwd          m0, m4
+    pmaddwd          m1, m5
+    paddd            m0, m1
+    paddd            m0, m3
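+    ; m0 now holds the top-row contributions for 4 output pixels; the inner
+    ; loop is scalar because each pixel also needs the left neighbour just
+    ; written (kept in val3d)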
+.x_loop_ar1_inner:
+    movd          val0d, m0
+    psrldq           m0, 4
+%if ARCH_X86_32
+    imul          val3d, r3mp
+%else
+    imul          val3d, cf3d
+%endif
+    add           val3d, val0d
+    sar           val3d, shiftb
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovns        val3d, maxd
+    cmp           val3d, mind
+    cmovs         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+%if ARCH_X86_32
+    add            r1mp, 82<<%3
+    dec            r0mp
+%else
+    add           bufyq, 82<<%3
+    dec              hd
+%endif
+    jg .y_loop_ar1
+    RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+    ALLOC_STACK   -8*16
+%endif
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    movifnidn     bufyq, bufymp
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 28
+    movd             m7, [base+round_vals-12+shiftq*2]
+    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
+    pxor             m2, m2
+    pcmpgtb          m2, m0
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2
+    pinsrw           m1, [base+pw_1], 5
+    punpcklwd        m7, m7
+    pshufd           m7, m7, q0000
+    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+    pshufd           m4, m1, q0000
+    pshufd           m5, m1, q1111
+    pshufd           m6, m1, q2222
+    pshufd           m3, m0, q3333
+    pshufd           m2, m0, q2222
+    pshufd           m1, m0, q1111
+    pshufd           m0, m0, q0000
+    SCRATCH           0, 8,  0
+    SCRATCH           1, 9,  1
+    SCRATCH           2, 10, 2
+    SCRATCH           3, 11, 3
+    SCRATCH           4, 12, 4
+    SCRATCH           5, 13, 5
+    SCRATCH           6, 14, 6
+    SCRATCH           7, 15, 7
+%if %2
+    movd             m7, [base+hmul_bits+2+%3*2]
+    movd             m6, [base+pb_1]
+    punpcklwd        m7, m7
+    pshufd           m6, m6, q0000
+    pshufd           m7, m7, q0000
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar2:
+    mov              xq, -(76>>%2)
+
+.x_loop_ar2:
+    pxor             m2, m2
+    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    pcmpgtb          m2, m0
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2
+    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
+    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
+    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
+    punpcklwd        m2, m0, m5
+    punpcklwd        m3, m4
+    pmaddwd          m2, m8
+    pmaddwd          m3, m11
+    paddd            m2, m3
+
+    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
+    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
+    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
+    punpcklwd        m4, m5
+    punpcklwd        m0, m1
+    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
+    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
+    punpcklwd        m3, m1
+    pmaddwd          m4, m9
+    pmaddwd          m0, m10
+    pmaddwd          m3, m12
+    paddd            m4, m0
+    paddd            m2, m3
+    paddd            m2, m4
+
+%if %2
+    movq             m1, [bufyq+xq*2]
+%if %3
+    movq             m3, [bufyq+xq*2+82]
+%endif
+    pmaddubsw        m0, m6, m1
+%if %3
+    pmaddubsw        m1, m6, m3
+    paddw            m0, m1
+%endif
+    pmulhrsw         m0, m7
+%else
+    movd             m0, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m0
+    punpcklbw        m0, m1
+%endif
+    punpcklwd        m0, m15
+    pmaddwd          m0, m14
+    paddd            m2, m0
+
+    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+    pxor             m4, m4
+    movd             m5, [base+byte_blend+1]
+    punpcklbw        m5, m5
+.x_loop_ar2_inner:
+    pcmpgtb          m1, m4, m0
+    punpcklbw        m0, m1
+    pmaddwd          m3, m0, m13
+    paddd            m3, m2
+    psrldq           m2, 4                  ; shift top to next pixel
+    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
+    pslldq           m3, 4
+    pand             m3, m5
+    paddw            m0, m3
+    packsswb         m0, m0
+    movd    [bufq+xq-2], m0
+    psrldq           m0, 1
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    movifnidn     bufyq, bufymp
+%if ARCH_X86_32
+    ALLOC_STACK  -15*16
+%else
+    SUB             rsp, 16*7
+%assign stack_size_padded (stack_size_padded+16*7)
+%assign stack_size (stack_size+16*7)
+%endif
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 28
+
+    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
+    pxor             m3, m3
+    pcmpgtb          m3, m0
+    punpckhbw        m1, m0, m3
+    punpcklbw        m0, m3
+    pshufd           m2, m0, q1111
+    pshufd           m3, m0, q2222
+    pshufd           m4, m0, q3333
+    pshufd           m0, m0, q0000
+    pshufd           m5, m1, q1111
+    pshufd           m6, m1, q2222
+    pshufd           m7, m1, q3333
+    pshufd           m1, m1, q0000
+    mova    [rsp+ 0*16], m0
+    mova    [rsp+ 1*16], m2
+    mova    [rsp+ 2*16], m3
+    mova    [rsp+ 3*16], m4
+    mova    [rsp+ 4*16], m1
+    mova    [rsp+ 5*16], m5
+    mova    [rsp+ 6*16], m6
+    SCRATCH           7, 8, 7
+
+    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
+    pxor             m4, m4
+    pcmpgtb          m4, m2
+    punpckhbw        m5, m2, m4
+    punpcklbw        m2, m4
+    pshufd           m4, m2, q3232
+    punpcklwd        m3, m4, m5
+    pshuflw          m5, m4, q3321
+    pshufd           m4, m3, q0000
+    pshufd           m3, m2, q1111
+    pshufd           m2, m2, q0000
+    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
+    SCRATCH           2, 9,  8
+    SCRATCH           3, 10, 9
+    SCRATCH           4, 11, 10
+    SCRATCH           5, 12, 11
+
+    movd             m2, [base+round_vals-12+shiftq*2]
+%if %2
+    movd             m1, [base+pb_1]
+    movd             m3, [base+hmul_bits+2+%3*2]
+%endif
+    pxor             m0, m0
+    punpcklwd        m2, m0
+%if %2
+    punpcklwd        m3, m3
+%endif
+    pshufd           m2, m2, q0000
+%if %2
+    pshufd           m1, m1, q0000
+    pshufd           m3, m3, q0000
+    SCRATCH           1, 13, 12
+%endif
+    SCRATCH           2, 14, 13
+%if %2
+    SCRATCH           3, 15, 14
+%endif
+
+    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
+    add           bufyq, 79+82*3
+    mov              hd, 70-35*%3
+.y_loop_ar3:
+    mov              xq, -(76>>%2)
+
+.x_loop_ar3:
+    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    pxor             m4, m4
+    pcmpgtb          m4, m0
+    punpckhbw        m3, m0, m4
+    punpcklbw        m0, m4
+
+    psrldq           m5, m0, 2
+    psrldq           m6, m0, 4
+    psrldq           m7, m0, 6
+    punpcklwd        m4, m0, m5
+    punpcklwd        m6, m7
+    pmaddwd          m4, [rsp+ 0*16]
+    pmaddwd          m6, [rsp+ 1*16]
+    paddd            m4, m6
+
+    palignr          m2, m3, m0, 10
+    palignr          m3, m0, 12
+    psrldq           m0, 8
+
+    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    pxor             m6, m6
+    pcmpgtb          m6, m1
+    punpckhbw        m5, m1, m6
+    punpcklbw        m1, m6
+
+    punpcklwd        m0, m2
+    punpcklwd        m3, m1
+    pmaddwd          m0, [rsp+ 2*16]
+    pmaddwd          m3, [rsp+ 3*16]
+    paddd            m0, m3
+    paddd            m0, m4
+
+    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor             m7, m7
+    pcmpgtb          m7, m2
+    punpckhbw        m6, m2, m7
+    punpcklbw        m2, m7
+
+    palignr          m3, m5, m1, 10
+    palignr          m5, m1, 12
+    psrldq           m4, m2, 2
+
+    punpcklwd        m3, m5
+    punpcklwd        m5, m2, m4
+    pmaddwd          m3, [rsp+ 6*16]
+    pmaddwd          m5, m8
+    paddd            m3, m5
+    paddd            m0, m3
+
+    psrldq           m3, m1, 2
+    psrldq           m4, m1, 4
+    psrldq           m5, m1, 6
+    psrldq           m1, 8
+
+    punpcklwd        m3, m4
+    punpcklwd        m5, m1
+    pmaddwd          m3, [rsp+ 4*16]
+    pmaddwd          m5, [rsp+ 5*16]
+    paddd            m3, m5
+    paddd            m0, m3
+
+%if %2
+    movq             m1, [bufyq+xq*2]
+%if %3
+    movq             m3, [bufyq+xq*2+82]
+%endif
+    pmaddubsw        m7, m13, m1
+%if %3
+    pmaddubsw        m5, m13, m3
+    paddw            m7, m5
+%endif
+    pmulhrsw         m7, m15
+%else
+    movd             m7, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m7
+    punpcklbw        m7, m1
+%endif
+
+    psrldq           m1, m2, 4
+    psrldq           m3, m2, 6
+    palignr          m4, m6, m2, 10
+    palignr          m6, m2, 12
+    psrldq           m2, 8
+
+    punpcklwd        m1, m3
+    punpcklwd        m2, m4
+    punpcklwd        m6, m7
+    pmaddwd          m1, m9
+    pmaddwd          m2, m10
+    pmaddwd          m6, m11
+    paddd            m1, m2
+    paddd            m0, m6
+    paddd            m0, m1
+    paddd            m0, m14
+
+    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+    pxor             m4, m4
+    movd             m5, [base+byte_blend]
+.x_loop_ar3_inner:
+    pcmpgtb          m2, m4, m1
+    punpcklbw        m3, m1, m2
+    pmaddwd          m2, m3, m12
+    pshufd           m3, m2, q1111
+    paddd            m2, m3                 ; left+cur
+    paddd            m2, m0                 ; add top
+    psrldq           m0, 4
+    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
+    ; don't packssdw since we only care about one value
+    packsswb         m2, m2
+    pandn            m3, m5, m1
+    pslld            m2, 24
+    pand             m2, m5
+    por              m1, m2, m3
+    movd    [bufq+xq-3], m1
+    psrldq           m1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    add           bufyq, 82<<%3
+    dec              hd
+    jg .y_loop_ar3
+    RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
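+; SSSE3 has no gather instruction, so vpgatherdw emulates one: each pair of
+; 16-bit indices is bounced through a GPR (movd/movzx + shr) and the fetched
+; words are pinsrw'd back into dst. The loads are word-sized from a byte
+; table, so callers mask the result with 0x00ff afterwards; the optional 6th
+; argument names a scratch xmm so the source register survives.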
+%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 6
+%define %%tmp %6
+%endif
+%rep 4
+%if %%idx == 0
+    movd        %5 %+ d, %2
+    pshuflw       %%tmp, %2, q3232
+%else
+    movd        %5 %+ d, %%tmp
+%if %%idx == 2
+    punpckhqdq    %%tmp, %%tmp
+%elif %%idx == 4
+    psrlq         %%tmp, 32
+%endif
+%endif
+    movzx       %4 %+ d, %5 %+ w
+    shr         %5 %+ d, 16
+
+%if %%idx == 0
+    movd             %1, [%3+%4]
+%else
+    pinsrw           %1, [%3+%4], %%idx + 0
+%endif
+    pinsrw           %1, [%3+%5], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
+
+INIT_XMM ssse3
+; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
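+; Luma grain application. The image is walked in 32x32 blocks, processed as
+; two 16-pixel half-blocks; each block draws pseudo-random offsets into
+; grain_lut from the seed, and neighbouring blocks are blended over a
+; 2-pixel horizontal / 2-line vertical overlap region when overlap_flag is
+; set.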
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
+        dst, src, scaling, unused1, fg_data, picptr, unused2
+    ; copy stack arguments to new position post-alignment, so that we
+    ; don't have to keep the old stack location in a separate register
+    mov              r0, r0m
+    mov              r1, r2m
+    mov              r2, r4m
+    mov              r3, r6m
+    mov              r4, r7m
+    mov              r5, r8m
+
+    mov [rsp+6*mmsize+ 3*gprsize], r0
+    mov [rsp+6*mmsize+ 5*gprsize], r1
+    mov [rsp+6*mmsize+ 7*gprsize], r2
+    mov [rsp+6*mmsize+ 9*gprsize], r3
+    mov [rsp+6*mmsize+10*gprsize], r4
+    mov [rsp+6*mmsize+11*gprsize], r5
+%else
+cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
+        dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+    mov            srcq, srcm
+    mov        fg_dataq, r3m
+    mov        scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+6*mmsize+ 3*gprsize]
+%define r1m [rsp+6*mmsize+ 4*gprsize]
+%define r2m [rsp+6*mmsize+ 5*gprsize]
+%define r3m [rsp+6*mmsize+ 6*gprsize]
+%define r4m [rsp+6*mmsize+ 7*gprsize]
+%define r5m [rsp+6*mmsize+ 8*gprsize]
+%define r6m [rsp+6*mmsize+ 9*gprsize]
+%define r7m [rsp+6*mmsize+10*gprsize]
+%define r8m [rsp+6*mmsize+11*gprsize]
+%endif
+    LEA              r5, pb_mask
+%define base r5-pb_mask
+    mov             r5m, picptrq
+%else
+cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+    lea              r7, [pb_mask]
+%define base r7-pb_mask
+%endif
+    mov             r6d, [fg_dataq+FGData.scaling_shift]
+    movd             m3, [base+mul_bits+r6*2-14]
+    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
+    pcmpeqw          m2, m2
+    psrldq           m2, 14
+    movd             m4, [base+max+r6*4]
+    movd             m5, [base+min+r6*2]
+    punpcklwd        m3, m3
+    punpcklwd        m4, m4
+    punpcklwd        m5, m5
+    pshufd           m3, m3, q0000
+    pshufd           m4, m4, q0000
+    pshufd           m5, m5, q0000
+    SCRATCH           2, 10, 0
+    SCRATCH           3, 11, 1
+    SCRATCH           4, 12, 2
+    SCRATCH           5, 13, 3
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+    mov            sbyd, r8m
+    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+    test       overlapd, overlapd
+    jz .no_vertical_overlap
+    mova             m6, [base+pw_1024]
+    movd             m7, [base+pb_27_17_17_27]
+    SCRATCH           6, 14, 4
+    SCRATCH           7, 15, 5
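+    ; m15 = the spec's {27,17}/{17,27} weights for the 2 overlapped columns;
+    ; pmulhrsw against m14 (pw_1024) then computes (x + 16) >> 5, i.e. the
+    ; round2(x, 5) of the overlap blend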
+    test           sbyd, sbyd
+    jnz .vertical_overlap
+    ; fall-through
+
+.no_vertical_overlap:
+    mov             r8m, overlapd
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+    imul           seed, (173 << 24) | 37
+%else
+    imul           seed, sbyd, (173 << 24) | 37
+%endif
+    add            seed, (105 << 24) | 178
+    rol            seed, 8
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
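+    ; the imul/add/rol/movzx sequence packs the spec's block-row seed:
+    ; low byte = 173 * sby + 105, high byte = 37 * sby + 178,
+    ; then XORs in the frame grain seed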
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+    mov             r3m, seed
+    mov              wq, r4m
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                unused1, unused2, see, unused3
+%endif
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub           dstmp, srcq
+%if ARCH_X86_32
+    mov             r1m, src_bakq
+    mov             r4m, wq
+    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+    mov            seed, r3m
+%endif
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                ; updated seed
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, unused
+
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+    ror           offyd, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
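+    ; offx/offy are doubled (offxq*2 above, offyd*164 = offy*2 rows) so the
+    ; grain window starts on even coordinates; 747 = 9*82 + 9 then adds the
+    ; 9-pixel top/left margin of the 82-byte-wide grain template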
+
+%if ARCH_X86_32
+    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, unused
+%endif
+
+.loop_x_odd:
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+.loop_y:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; scaling[src]
+%if ARCH_X86_32
+    vpgatherdw       m4, m0, scalingq, r0, r5, m3
+    vpgatherdw       m5, m1, scalingq, r0, r5, m3
+%else
+    vpgatherdw       m4, m0, scalingq, r12, r13, m3
+    vpgatherdw       m5, m1, scalingq, r12, r13, m3
+%endif
+    pcmpeqw          m3, m3
+    psrlw            m3, 8
+    pand             m4, m3
+    pand             m5, m3
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+    pcmpgtb          m7, m2, m3
+    punpcklbw        m2, m3, m7
+    punpckhbw        m3, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m4
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
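+    ; m11 (from mul_bits) appears to hold 2^(15-scaling_shift), so the
+    ; pmulhrsw pair implements round2(scaling[src] * grain, scaling_shift)
+    ; on the exact 16-bit products from pmullw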
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+    add            srcq, r2mp
+    add      grain_lutq, 82
+    dec              hd
+    jg .loop_y
+
+%if ARCH_X86_32
+    add            r4mp, 16
+%else
+    add              wq, 16
+%endif
+    jge .end
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    add            srcq, r4mp
+%else
+    lea            srcq, [src_bakq+wq]
+%endif
+    btc       dword r8m, 2
+    jc .next_blk
+
+    add          offxyd, 16
+    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
+    jz .loop_x_odd
+
+%if ARCH_X86_32
+    add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16             ; top_offxyd
+%endif
+    jnz .loop_x_odd_v_overlap
+
+.next_blk:
+    test      dword r8m, 1
+    jz .loop_x
+
+    test      dword r8m, 2
+    jnz .loop_x_hv_overlap
+
+    ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
+
+    add          offxyd, 16                 ; left_offxyd
+    mov [rsp+6*mmsize+0*gprsize], offxyd
+
+    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+
+    mov            seed, r3m
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy
+
+    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+%endif
+
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                ; updated seed
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+    mov           offxd, offyd
+%else
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+    ror           offyd, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy
+%endif
+
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; scaling[src]
+%if ARCH_X86_32
+    vpgatherdw       m4, m0, scalingq, r0, r5, m3
+    vpgatherdw       m5, m1, scalingq, r0, r5, m3
+%else
+    vpgatherdw       m4, m0, scalingq, r12, r13, m3
+    vpgatherdw       m5, m1, scalingq, r12, r13, m3
+%endif
+    pcmpeqw          m3, m3
+    psrlw            m3, 8
+    pand             m4, m3
+    pand             m5, m3
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    mov              r5, [rsp+6*mmsize+0*gprsize]
+    movd             m7, [grain_lutq+r5]
+%else
+    movd             m7, [grain_lutq+left_offxyq]
+%endif
+    punpcklbw        m7, m3
+    pmaddubsw        m6, m15, m7
+    pmulhrsw         m6, m14
+    packsswb         m6, m6
+    pand             m6, m10
+    pandn            m7, m10, m3
+    por              m6, m7
+    pcmpgtb          m2, m6
+    punpcklbw        m7, m6, m2
+    punpckhbw        m6, m2
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m7, m4
+    pmullw           m6, m5
+    pmulhrsw         m7, m11
+    pmulhrsw         m6, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m7
+    paddw            m1, m6
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+    add            srcq, r2mp
+    add      grain_lutq, 82
+    dec              hd
+    jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+    add            r4mp, 16
+%else
+    add              wq, 16
+%endif
+    jge .end
+%if ARCH_X86_32
+    mov            srcq, r1m
+    add            srcq, r4m
+%else
+    lea            srcq, [src_bakq+wq]
+%endif
+    xor       dword r8m, 4
+    add          offxyd, 16
+
+    ; since this half-block had left-overlap, the next does not
+    test      dword r8m, 2              ; have_top_overlap
+    jz .loop_x_odd
+%if ARCH_X86_32
+    add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16             ; top_offxyd
+%endif
+    jmp .loop_x_odd_v_overlap
+
+.end:
+    RET
+
+.vertical_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+    or         overlapd, 2                  ; top_overlap: overlap & 2
+    mov             r8m, overlapd
+    movzx          sbyd, sbyb
+%if ARCH_X86_32
+    imul             r4, [fg_dataq+FGData.seed], 0x00010001
+    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+    imul           tmpd, sbyd, 173 * 0x00010001
+    imul           sbyd, 37 * 0x01000100
+    add            tmpd, (105 << 16) | 188
+    add            sbyd, (178 << 24) | (141 << 8)
+    and            tmpd, 0x00ff00ff
+    and            sbyd, 0xff00ff00
+    xor            seed, tmpd
+%if ARCH_X86_32
+    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+    mov             r3m, seed
+    mov              wq, r4m
+%else
+    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                tmp, unused2, see, unused3
+%endif
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub           dstmp, srcq
+%if ARCH_X86_32
+    mov             r1m, src_bakq
+    mov             r4m, wq
+    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+    mov            seed, r3m
+%endif
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed,
+    ; because of the 'and tmpd, 0x00ff00ff' above
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp           tmpb                     ; parity of top_seed
+    shr            seed, 16
+    shl            tmpd, 16
+    test           seeb, seeh
+    setp           tmpb                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor            tmpd, r6d
+    mov            seed, tmpd
+    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, unused, top_offxy
+
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+
+    ror           offyd, 8
+    ror           offxd, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, unused, top_offxy
+%endif
+
+    movzx    top_offxyd, offxyw
+%if ARCH_X86_32
+    mov [rsp+6*mmsize+1*gprsize], top_offxyd
+
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+    shr          offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+    mov              r5, r5m
+    lea              r5, [base+pb_27_17]
+    mov [rsp+5*mmsize+8], r5
+%else
+    mova             m8, [pb_27_17]
+%endif
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+    ; src
+    mova             m0, [srcq]
+    pxor             m2, m2
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; scaling[src]
+%if ARCH_X86_32
+    vpgatherdw       m4, m0, scalingq, r0, r5, m3
+    vpgatherdw       m5, m1, scalingq, r0, r5, m3
+%else
+    vpgatherdw       m4, m0, scalingq, r12, r13, m3
+    vpgatherdw       m5, m1, scalingq, r12, r13, m3
+%endif
+    pcmpeqw          m3, m3
+    psrlw            m3, 8
+    pand             m4, m3
+    pand             m5, m3
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    mov              r5, [rsp+6*mmsize+1*gprsize]
+    movu             m7, [grain_lutq+r5]
+%else
+    movu             m7, [grain_lutq+top_offxyq]
+%endif
+    punpckhbw        m6, m7, m3
+    punpcklbw        m7, m3
+%if ARCH_X86_32
+    mov              r5, [rsp+5*mmsize+8]
+    pmaddubsw        m3, [r5], m6
+    pmaddubsw        m6, [r5], m7
+%else
+    pmaddubsw        m3, m8, m6
+    pmaddubsw        m6, m8, m7
+%endif
+    pmulhrsw         m3, m14
+    pmulhrsw         m6, m14
+    packsswb         m6, m3
+    pcmpgtb          m7, m2, m6
+    punpcklbw        m2, m6, m7
+    punpckhbw        m6, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m2, m4
+    pmullw           m6, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m6, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m6
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add dword [rsp+5*mmsize+8], mmsize
+%else
+    mova             m8, [pb_17_27]
+%endif
+    add            srcq, r2mp
+    add      grain_lutq, 82
+    dec              hw
+    jz .end_y_v_overlap
+    ; 2 lines get vertical overlap, then fall back to non-overlap code for
+    ; remaining (up to) 30 lines
+    btc              hd, 16
+    jnc .loop_y_v_overlap
+    jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+    add            r4mp, 16
+%else
+    add              wq, 16
+%endif
+    jge .end_hv
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    add            srcq, r4mp
+%else
+    lea            srcq, [src_bakq+wq]
+%endif
+    btc       dword r8m, 2
+    jc .loop_x_hv_overlap
+    add          offxyd, 16
+%if ARCH_X86_32
+    add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    jmp .loop_x_odd_v_overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+    mov              r5, r5m
+    lea              r5, [base+pb_27_17]
+    mov [rsp+5*mmsize+8], r5
+
+    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
+
+    mov              r5, [rsp+6*mmsize+1*gprsize]
+    mov              r4, offxyd
+    add              r5, 16
+    add              r4, 16
+    mov [rsp+6*mmsize+2*gprsize], r5        ; topleft_offxy
+    mov [rsp+6*mmsize+0*gprsize], r4        ; left_offxy
+
+    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
+
+    xor            tmpd, tmpd
+    mov            seed, r3m
+%else
+    mova             m8, [pb_27_17]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                tmp, unused2, see, unused3
+
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp           tmpb                     ; parity of top_seed
+    shr            seed, 16
+    shl            tmpd, 16
+    test           seeb, seeh
+    setp           tmpb                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor            tmpd, r6d
+    mov            seed, tmpd
+    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+    lea  topleft_offxyq, [top_offxyq+16]
+    lea     left_offxyq, [offyq+16]
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+    ror           offyd, 8
+    ror           offxd, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+
+    movzx            r5, offxyw             ; top_offxy
+    mov [rsp+6*mmsize+1*gprsize], r5
+%else
+    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+    movzx    top_offxyd, offxyw
+%endif
+    shr          offxyd, 16
+
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    mov              r5, [rsp+6*mmsize+1*gprsize]   ; top_offxy
+    mov              r0, [rsp+6*mmsize+0*gprsize]   ; left_offxy
+    movu             m6, [grain_lutq+r5]
+    mov              r5, [rsp+6*mmsize+2*gprsize]   ; topleft_offxy
+    movd             m4, [grain_lutq+r0]
+    movd             m7, [grain_lutq+r5]
+%else
+    movu             m6, [grain_lutq+top_offxyq]
+    movd             m4, [grain_lutq+left_offxyq]
+    movd             m7, [grain_lutq+topleft_offxyq]
+%endif
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw        m4, m3
+    punpcklbw        m7, m6
+    pmaddubsw        m2, m15, m4
+    pmaddubsw        m4, m15, m7
+    pmulhrsw         m2, m14
+    pmulhrsw         m4, m14
+    packsswb         m2, m2
+    packsswb         m4, m4
+    pand             m2, m10
+    pand             m4, m10
+    pandn            m7, m10, m3
+    pandn            m3, m10, m6
+    por              m7, m2
+    por              m3, m4
+    ; followed by v interpolation (top | cur -> cur)
+    punpckhbw        m4, m3, m7
+    punpcklbw        m3, m7
+%if ARCH_X86_32
+    mov              r5, [rsp+5*mmsize+8]
+    pmaddubsw        m7, [r5], m4
+    pmaddubsw        m4, [r5], m3
+%else
+    pmaddubsw        m7, m8, m4
+    pmaddubsw        m4, m8, m3
+%endif
+    pmulhrsw         m7, m14
+    pmulhrsw         m4, m14
+    packsswb         m4, m7
+    pxor             m2, m2
+    pcmpgtb          m7, m2, m4
+    punpcklbw        m3, m4, m7
+    punpckhbw        m4, m7
+
+    ; src
+    mova             m0, [srcq]
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; scaling[src]
+%if ARCH_X86_32
+    vpgatherdw       m5, m0, scalingq, r0, r5, m7
+    vpgatherdw       m6, m1, scalingq, r0, r5, m7
+%else
+    vpgatherdw       m5, m0, scalingq, r13, r14, m7
+    vpgatherdw       m6, m1, scalingq, r13, r14, m7
+%endif
+    pcmpeqw          m7, m7
+    psrlw            m7, 8
+    pand             m5, m7
+    pand             m6, m7
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw           m3, m5
+    pmullw           m4, m6
+    pmulhrsw         m3, m11
+    pmulhrsw         m4, m11
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m3
+    paddw            m1, m4
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add dword [rsp+5*mmsize+8], mmsize
+%else
+    mova             m8, [pb_17_27]
+%endif
+    add            srcq, r2mp
+    add      grain_lutq, 82
+    dec              hw
+    jz .end_y_hv_overlap
+    ; 2 lines get vertical overlap, then fall back to non-overlap code for
+    ; remaining (up to) 30 lines
+    btc              hd, 16
+    jnc .loop_y_hv_overlap
+    jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+%if ARCH_X86_32
+    add            r4mp, 16
+%else
+    add              wq, 16
+%endif
+    jge .end_hv
+%if ARCH_X86_32
+    mov            srcq, r1m
+    add            srcq, r4m
+%else
+    lea            srcq, [src_bakq+wq]
+%endif
+    xor       dword r8m, 4
+    add          offxyd, 16
+%if ARCH_X86_32
+    add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    jmp .loop_x_odd_v_overlap
+
+.end_hv:
+    RET
+
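+; Chroma grain application. FGUV_FN is instantiated once per chroma layout;
+; ss_hor/ss_ver describe the luma->chroma subsampling. The body is expanded
+; twice through %%FGUV_32x32xN_LOOP: once for the
+; uv_mult/uv_luma_mult/uv_offset blend and once for
+; chroma_scaling_from_luma (csfl).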
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+; fguv_32x32xn_i%1_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
+;                        sby, luma, lstride, uv_pl, is_id)
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
+        tmp, src, scaling, h, fg_data, picptr, unused
+    mov              r0, r0m
+    mov              r1, r2m
+    mov              r2, r4m
+    mov              r3, r6m
+    mov              r4, r7m
+    mov [rsp+8*mmsize+3*gprsize], r0
+    mov [rsp+8*mmsize+5*gprsize], r1
+    mov [rsp+8*mmsize+7*gprsize], r2
+    mov [rsp+8*mmsize+9*gprsize], r3
+    mov [rsp+8*mmsize+10*gprsize], r4
+
+    mov              r0, r8m
+    mov              r1, r9m
+    mov              r2, r10m
+    mov              r4, r11m
+    mov              r3, r12m
+    mov [rsp+8*mmsize+11*gprsize], r0
+    mov [rsp+8*mmsize+12*gprsize], r1
+    mov [rsp+8*mmsize+13*gprsize], r2
+    mov [rsp+8*mmsize+14*gprsize], r4
+%else
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+        tmp, src, scaling, h, fg_data, picptr, unused
+%endif
+    mov            srcq, srcm
+    mov        fg_dataq, r3m
+    mov        scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+%endif
+    LEA              r5, pb_mask
+%define base r5-pb_mask
+    mov             r5m, r5
+%else
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+    lea              r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+    mov             r6d, [fg_dataq+FGData.scaling_shift]
+    pcmpeqw          m2, m2
+    movd             m3, [base+mul_bits+r6*2-14]
+    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
+    lea            tmpd, [r6d*2]
+%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
+    test             r3, r3
+%else
+    cmp      dword r12m, 0                      ; is_idm
+%endif
+    movd             m5, [base+min+r6*2]
+    cmovne          r6d, tmpd
+    movd             m4, [base+max+r6*2]
+    psrldq           m2, 14+%2
+    punpcklwd        m3, m3
+    punpcklwd        m5, m5
+    punpcklwd        m4, m4
+    pshufd           m3, m3, q0000
+    pshufd           m5, m5, q0000
+    pshufd           m4, m4, q0000
+    SCRATCH           2, 10, 0
+    SCRATCH           3, 11, 1
+    SCRATCH           4, 12, 2
+    SCRATCH           5, 13, 3
+
+    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+    jne .csfl
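+    ; csfl: index the scaling LUT with the (downsampled) luma directly;
+    ; otherwise the index is clip(((luma * uv_luma_mult + chroma * uv_mult)
+    ; >> 6) + uv_offset), which the %1 path of the macro below computes with
+    ; a single pmaddubsw against the interleaved multiplier pair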
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+%if %1
+    mov             r6d, dword r11m
+    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
+    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+    punpcklbw        m6, m1, m0
+    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
+    punpcklwd        m6, m6
+    punpcklwd        m7, m7
+    pshufd           m6, m6, q0000
+    pshufd           m7, m7, q0000
+    SCRATCH           6, 14, 4
+    SCRATCH           7, 15, 5
+%endif
+
+    mov            sbyd, r8m
+    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+    test       overlapd, overlapd
+    jz %%no_vertical_overlap
+%if ARCH_X86_32
+%if %2
+    movd             m1, [base+pb_23_22]
+%else
+    movd             m1, [base+pb_27_17_17_27]
+%endif
+    mova             m0, [base+pw_1024]
+%else
+%if %2
+    movd             m1, [pb_23_22]
+%else
+    movd             m1, [pb_27_17_17_27]
+%endif
+    mova             m0, [pw_1024]
+%endif
+    pshufd           m1, m1, q0000
+    SCRATCH           0, 8, 6
+    SCRATCH           1, 9, 7
+    test           sbyd, sbyd
+    jnz %%vertical_overlap
+    ; fall-through
+
+%%no_vertical_overlap:
+    mov             r8m, overlapd
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+    imul           seed, (173 << 24) | 37
+%else
+    imul           seed, sbyd, (173 << 24) | 37
+%endif
+    add            seed, (105 << 24) | 178
+    rol            seed, 8
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+%define luma_bakq lumaq
+
+    mov              wq, r4m
+%if %3
+    shl           r10mp, 1
+%endif
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
+
+    mov        lstrideq, r10mp
+%endif
+
+    mov           lumaq, r9mp
+    lea        src_bakq, [srcq+wq]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
+    neg              wq
+    sub            r0mp, srcq
+%if ARCH_X86_32
+    mov             r1m, src_bakq
+    mov            r11m, luma_bakq
+    mov             r4m, wq
+
+    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+    mov           r11mp, src_bakq
+    mov           r12mp, strideq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+    mov            seed, r3m
+%endif
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d               ; updated seed
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, overlap, unused1, unused2, lstride
+
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+    ror           offyd, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
+%endif
+
+%%loop_x_odd:
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+%%loop_y:
+    ; src
+%if ARCH_X86_32
+    mov           lumaq, r9mp
+%endif
+%if %2
+    mova             m4, [lumaq+ 0]
+    mova             m6, [lumaq+16]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+    mov              r5, r5m
+    movd             m7, [base+pb_1]
+%else
+    movd             m7, [pb_1]
+%endif
+    pshufd           m7, m7, q0000
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
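+    ; with ss_hor set, pmaddubsw against pb_1 sums each horizontal luma pair
+    ; and pavgw against zero halves it with rounding: a rounded 2:1
+    ; horizontal downsample (vertical subsampling just skips every other
+    ; luma row via the doubled stride)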
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
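+    ; m4/m6 now hold the scaling indices as words: raw downsampled luma for
+    ; csfl, or the blended value saturated to [0,255] by packuswb (the
+    ; "pack+unpack = clip" trick above)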
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+    vpgatherdw       m7, m4, scalingq, r0, r5
+    vpgatherdw       m5, m6, scalingq, r0, r5
+%else
+    vpgatherdw       m7, m4, scalingq, r12, r2
+    vpgatherdw       m5, m6, scalingq, r12, r2
+%endif
+    pcmpeqw          m1, m1
+    psrlw            m1, 8
+    pand             m7, m1
+    pand             m5, m1
+
+    ; unpack chroma_source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq+ 0]
+    pcmpgtb          m6, m2, m3
+    punpcklbw        m2, m3, m6
+    punpckhbw        m3, m6
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m7
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add            srcq, r2mp
+    ; we already incremented lumaq above
+%else
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
+%endif
+    add      grain_lutq, 82
+    dec              hw
+    jg %%loop_y
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov              wq, r4m
+%endif
+    add              wq, 16
+    jge %%end
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    mov           lumaq, r11mp
+%else
+    mov            srcq, r11mp
+%endif
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
+    add            srcq, wq
+%if ARCH_X86_32
+    mov             r4m, wq
+    mov             r9m, lumaq
+%endif
+%if %2 == 0
+    ; adjust top_offxy
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2
+    jc %%loop_x_even
+    test      dword r8m, 2
+    jz %%loop_x_odd
+    jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
+    test      dword r8m, 1
+    jz %%loop_x
+
+    ; r8m = overlap flags (bit 1 = vertical overlap)
+    test      dword r8m, 2
+    jne %%loop_x_hv_overlap
+
+    ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+%if %2
+    lea              r6, [offxyd+16]
+    mov [rsp+8*mmsize+0*gprsize], r6
+%else
+    mov [rsp+8*mmsize+0*gprsize], offxyd
+%endif
+
+    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
+
+    mov            seed, r3m
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, unused1, unused2, lstride
+
+%if %2
+    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+%else
+    mov     left_offxyd, offyd
+%endif
+%endif
+    mov             r6d, seed
+    or             seed, 0xEFF4
+    shr             r6d, 1
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                ; updated seed
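+    ; this is the AV1 grain LFSR: the taps are the zero bits of 0xEFF4
+    ; (bits 0, 1, 3, 12), so with every other bit forced to 1 the parity
+    ; of seeb & seeh is the XOR of the taps; cmovp keeps r6 (feedback 0)
+    ; on even parity, else the lea'd r6+0x8000 (feedback 1) stands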
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
+
+    mov          offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, unused1, unused2, lstride
+
+    mov           offyd, seed
+    mov           offxd, seed
+%endif
+    ror           offyd, 8
+    shr           offxd, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
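+    ; grain_lut rows are 82 bytes apart; the constants skip the grain
+    ; template's top/left border, 3+(6>>ss) pixels per dimension
+    ; (%2/%3 = horizontal/vertical chroma subsampling)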
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
+%endif
+
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+    ; src
+%if ARCH_X86_32
+    mov           lumaq, r9mp
+%endif
+%if %2
+    mova             m4, [lumaq+ 0]
+    mova             m6, [lumaq+16]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+    mov              r5, r5m
+    movd             m7, [base+pb_1]
+%else
+    movd             m7, [pb_1]
+%endif
+    pshufd           m7, m7, q0000
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+    vpgatherdw       m7, m4, scalingq, r0, r5
+    vpgatherdw       m5, m6, scalingq, r0, r5
+%else
+    vpgatherdw       m7, m4, scalingq, r12, r2
+    vpgatherdw       m5, m6, scalingq, r12, r2
+%endif
+    pcmpeqw          m1, m1
+    psrlw            m1, 8
+    pand             m7, m1
+    pand             m5, m1
+
+    ; unpack chroma_source
+    punpckhbw        m1, m0, m2
+    punpcklbw        m0, m2                 ; m0-1: src as word
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq+ 0]
+%if ARCH_X86_32
+    mov              r0, [rsp+8*mmsize+0*gprsize]
+    movd             m4, [grain_lutq+r0+ 0]
+%else
+    movd             m4, [grain_lutq+left_offxyq+ 0]
+%endif
+    punpcklbw        m2, m4, m3
+    pmaddubsw        m4, m9, m2
+    pmulhrsw         m4, m8
+    packsswb         m4, m4
+    pand             m4, m10
+    pandn            m2, m10, m3
+    por              m3, m4, m2
+    pxor             m4, m4
+    pcmpgtb          m4, m3
+    punpcklbw        m2, m3, m4
+    punpckhbw        m3, m4
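+    ; the left column's grain is paired with the current row's, weighted
+    ; with the overlap coefficients in m9, rounded via m8 and spliced in
+    ; through the byte mask m10 (all three set up outside this loop),
+    ; then the result is sign-extended to words as in the no-overlap case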
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m7
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m1, m3
+    pmaxsw           m0, m13
+    pmaxsw           m1, m13
+    pminsw           m0, m12
+    pminsw           m1, m12
+    packuswb         m0, m1
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add            srcq, r2mp
+    ; lumaq has already been incremented above
+%else
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
+%endif
+    add      grain_lutq, 82
+    dec              hw
+    jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov              wq, r4m
+%endif
+    add              wq, 16
+    jge %%end
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    mov           lumaq, r11mp
+%else
+    mov            srcq, r11mp
+%endif
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
+    add            srcq, wq
+%if ARCH_X86_32
+    mov             r4m, wq
+    mov             r9m, lumaq
+%endif
+%if %2 == 0
+    xor       dword r8m, 4
+    ; adjust top_offxyd
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16
+%endif
+    add          offxyd, 16
+%endif
+
+    ; r8m = overlap flags (bit 1 = vertical overlap)
+    test      dword r8m, 2
+%if %2
+    jne %%loop_x_hv_overlap
+    jmp %%loop_x_h_overlap
+%else
+    jne %%loop_x_odd_v_overlap
+    jmp %%loop_x_odd
+%endif
+
+%%end:
+    RET
+
+%%vertical_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+    or         overlapd, 2                  ; top_overlap: overlap & 2
+    mov             r8m, overlapd
+    movzx          sbyd, sbyb
+%if ARCH_X86_32
+    imul             r4, [fg_dataq+FGData.seed], 0x00010001
+    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+    imul           tmpd, sbyd, 173 * 0x00010001
+    imul           sbyd, 37 * 0x01000100
+    add            tmpd, (105 << 16) | 188
+    add            sbyd, (178 << 24) | (141 << 8)
+    and            tmpd, 0x00ff00ff
+    and            sbyd, 0xff00ff00
+    xor            seed, tmpd
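+    ; tmpd/sbyd now hold the AV1 per-row seed hashes (y*173+105 and
+    ; (y*37+178) << 8) for row sby in the upper 16 bits and the row above
+    ; in the lower ones: 188 = 105-173 and 141 = 178-37 modulo 256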
+%if ARCH_X86_32
+    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+    mov             r3m, seed
+    mov              wq, r4m
+%if %3
+    shl           r10mp, 1
+%endif
+%else
+    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+    mov        lstrideq, r10mp
+%endif
+
+    mov           lumaq, r9mp
+    lea        src_bakq, [srcq+wq]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
+    neg              wq
+    sub            r0mp, srcq
+%if ARCH_X86_32
+    mov             r1m, src_bakq
+    mov            r11m, luma_bakq
+    mov             r4m, wq
+
+    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+    mov           r11mp, src_bakq
+    mov           r12mp, strideq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+    mov            seed, r3m
+    xor            tmpd, tmpd
+%endif
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp           tmpb                     ; parity of top_seed
+    shr            seed, 16
+    shl            tmpd, 16
+    test           seeb, seeh
+    setp           tmpb                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor            tmpd, r6d
+    mov            seed, tmpd
+    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
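+    ; both 16-bit seeds take one LFSR step at once: each setp records the
+    ; complemented tap parity, which the forced-to-1 bits 0/16 of r6d flip
+    ; back during the xor, and the 32-bit ror shifts both seeds right while
+    ; making each feedback bit the new top bit of its half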
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, overlap, top_offxy, unused, lstride
+
+    mov           offxd, seed
+    mov           offyd, seed
+%endif
+    ror           offyd, 8
+    ror           offxd, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+%if ARCH_X86_32
+    mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+%%loop_x_odd_v_overlap:
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m1, [base+pb_27_17]
+%else
+    mova             m1, [pb_27_17]
+%endif
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+    mov           lumaq, r9mp
+%endif
+%if %2
+    mova             m4, [lumaq+ 0]
+    mova             m6, [lumaq+16]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+    mov              r5, r5m
+    movd             m7, [base+pb_1]
+%else
+    movd             m7, [pb_1]
+%endif
+    pshufd           m7, m7, q0000
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+    vpgatherdw       m7, m4, scalingq, r0, r5
+    vpgatherdw       m5, m6, scalingq, r0, r5
+%else
+    vpgatherdw       m7, m4, scalingq, r12, r2
+    vpgatherdw       m5, m6, scalingq, r12, r2
+%endif
+    pcmpeqw          m4, m4
+    psrlw            m4, 8
+    pand             m7, m4
+    pand             m5, m4
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu             m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    mov              r0, [rsp+8*mmsize+1*gprsize]
+    movu             m4, [grain_lutq+r0]
+%else
+    movu             m4, [grain_lutq+top_offxyq]
+%endif
+    punpckhbw        m6, m4, m3
+    punpcklbw        m4, m3
+%if %3
+    pmaddubsw        m2, m9, m6
+    pmaddubsw        m3, m9, m4
+%else
+    pmaddubsw        m2, m1, m6
+    pmaddubsw        m3, m1, m4
+%endif
+    pmulhrsw         m2, m8
+    pmulhrsw         m3, m8
+    packsswb         m3, m2
+    pxor             m6, m6
+    pcmpgtb          m6, m3
+    punpcklbw        m2, m3, m6
+    punpckhbw        m3, m6
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m7
+    pmullw           m3, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m3, m11
+
+    ; unpack chroma_source
+    pxor             m4, m4
+    punpckhbw        m6, m0, m4
+    punpcklbw        m0, m4                 ; m0/m6: src as word
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m6, m3
+    pmaxsw           m0, m13
+    pmaxsw           m6, m13
+    pminsw           m0, m12
+    pminsw           m6, m12
+    packuswb         m0, m6
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+    dec              hw
+    je %%end_y_v_overlap
+%if ARCH_X86_32
+    add            srcq, r2mp
+    ; lumaq has already been incremented above
+%else
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
+%endif
+    add      grain_lutq, 82
+%if %3 == 0
+    btc              hd, 16
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m1, [base+pb_17_27]
+%else
+    mova             m1, [pb_17_27]
+%endif
+    jnc %%loop_y_v_overlap
+%endif
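+    ; when %3 == 0, bit 16 of hd doubles as a row-pair flag: the first of
+    ; the two overlapped rows is blended with the 27/17 weights and the
+    ; second with 17/27 before the remaining rows take the plain loop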
+    jmp %%loop_y
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov              wq, r4m
+%endif
+    add              wq, 16
+    jge %%end_hv
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    mov           lumaq, r11mp
+%else
+    mov            srcq, r11mp
+%endif
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
+    add            srcq, wq
+%if ARCH_X86_32
+    mov             r4m, wq
+    mov             r9m, lumaq
+%endif
+
+%if %2
+    ; since the overlap flag in fg_dataq is guaranteed to be set, we never
+    ; jump back to %%loop_x_v_overlap, and instead always fall through to
+    ; h+v overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2
+    jnc %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+    mov              r6, [rsp+8*mmsize+1*gprsize]
+%if %2
+    lea              r0, [r3d+16]
+    add              r6, 16
+    mov [rsp+8*mmsize+0*gprsize], r0        ; left_offxy
+%else
+    mov [rsp+8*mmsize+0*gprsize], r3        ; left_offxy
+%endif
+    mov [rsp+8*mmsize+2*gprsize], r6        ; topleft_offxy
+
+    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+    mov            seed, r3m
+    xor            tmpd, tmpd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+%if %2
+    lea  topleft_offxyq, [top_offxyq+16]
+    lea     left_offxyq, [offxyq+16]
+%else
+    mov  topleft_offxyq, top_offxyq
+    mov     left_offxyq, offxyq
+%endif
+
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp           tmpb                     ; parity of top_seed
+    shr            seed, 16
+    shl            tmpd, 16
+    test           seeb, seeh
+    setp           tmpb                     ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor            tmpd, r6d
+    mov            seed, tmpd
+    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+    mov             r3m, seed
+
+    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+    mov           offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+    mov           offxd, seed
+    mov           offyd, seed
+%endif
+    ror           offyd, 8
+    ror           offxd, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+%if ARCH_X86_32
+    mov [rsp+8*mmsize+1*gprsize], top_offxyd
+%endif
+
+    mov              hd, r7m
+    mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m3, [base+pb_27_17]
+%else
+    mova             m3, [pb_27_17]
+%endif
+%%loop_y_hv_overlap:
+    ; src
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov           lumaq, r9mp
+%endif
+%if %2
+    mova             m4, [lumaq+ 0]
+    mova             m6, [lumaq+16]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+    mov              r5, r5m
+    movd             m7, [base+pb_1]
+%else
+    movd             m7, [pb_1]
+%endif
+    pshufd           m7, m7, q0000
+    pxor             m2, m2
+    pmaddubsw        m4, m7
+    pmaddubsw        m6, m7
+    pavgw            m4, m2
+    pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb         m4, m6                 ; luma
+%endif
+    punpckhbw        m6, m4, m0
+    punpcklbw        m4, m0                 ; { luma, chroma }
+    pmaddubsw        m6, m14
+    pmaddubsw        m4, m14
+    psraw            m6, 6
+    psraw            m4, 6
+    paddw            m6, m15
+    paddw            m4, m15
+    packuswb         m4, m6                 ; pack+unpack = clip
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
+%endif
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+    vpgatherdw       m7, m4, scalingq, r0, r5
+    vpgatherdw       m5, m6, scalingq, r0, r5
+%else
+    movd             m1, [grain_lutq+topleft_offxyq]
+%if %3
+    vpgatherdw       m7, m4, scalingq, r2, r12
+    vpgatherdw       m5, m6, scalingq, r2, r12
+%else
+    vpgatherdw       m7, m4, scalingq, r2, r13
+    vpgatherdw       m5, m6, scalingq, r2, r13
+%endif
+%endif
+    pcmpeqw          m2, m2
+    psrlw            m2, 8
+    pand             m7, m2
+    pand             m5, m2
+
+    ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+    mov              r0, [rsp+8*mmsize+2*gprsize]       ; topleft_offxy
+    mov              r5, [rsp+8*mmsize+1*gprsize]       ; top_offxy
+    movd             m1, [grain_lutq+r0]
+    mov              r0, [rsp+8*mmsize+0*gprsize]       ; left_offxy
+%endif
+    movu             m2, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    movu             m6, [grain_lutq+r5]
+    movd             m4, [grain_lutq+r0]
+%else
+    movu             m6, [grain_lutq+top_offxyq]
+    movd             m4, [grain_lutq+left_offxyq]
+%endif
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw        m1, m6
+    punpcklbw        m4, m2
+%if %2
+    punpcklwd        m4, m1
+%else
+    punpckldq        m4, m1
+%endif
+    pmaddubsw        m1, m9, m4
+    pmulhrsw         m1, m8
+    packsswb         m1, m1
+    pandn            m4, m10, m2
+    pandn            m2, m10, m6
+    psrldq           m6, m1, 2-%2
+    pand             m1, m10
+    pand             m6, m10
+    por              m4, m1
+    por              m2, m6
+    ; followed by v interpolation (top | cur -> cur)
+    punpckhbw        m1, m2, m4
+    punpcklbw        m2, m4
+%if %3
+    pmaddubsw        m4, m9, m1
+    pmaddubsw        m1, m9, m2
+%else
+    pmaddubsw        m4, m3, m1
+    pmaddubsw        m1, m3, m2
+%endif
+    pmulhrsw         m4, m8
+    pmulhrsw         m1, m8
+    packsswb         m1, m4
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpcklbw        m2, m1, m4
+    punpckhbw        m1, m4
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw           m2, m7
+    pmullw           m1, m5
+    pmulhrsw         m2, m11
+    pmulhrsw         m1, m11
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; unpack chroma source
+    pxor             m4, m4
+    punpckhbw        m5, m0, m4
+    punpcklbw        m0, m4                 ; m0/m5: src as word
+
+    ; dst = clip_pixel(src, noise)
+    paddw            m0, m2
+    paddw            m5, m1
+    pmaxsw           m0, m13
+    pmaxsw           m5, m13
+    pminsw           m0, m12
+    pminsw           m5, m12
+    packuswb         m0, m5
+    movifnidn      dstq, dstmp
+    mova    [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add            srcq, r2mp
+    ; lumaq has been adjusted above already
+%else
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+    add           lumaq, r10mp
+%endif
+%endif
+    add      grain_lutq, 82
+    dec              hw
+%if %3
+    jg %%loop_y_h_overlap
+%else
+    jle %%end_y_hv_overlap
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m3, [base+pb_17_27]
+%else
+    mova             m3, [pb_17_27]
+%endif
+    btc              hd, 16
+    jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+    jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+%endif
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov              wq, r4m
+%endif
+    add              wq, 16
+    jge %%end_hv
+%if ARCH_X86_32
+    mov            srcq, r1mp
+    mov           lumaq, r11mp
+%else
+    mov            srcq, r11mp
+%endif
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
+    add            srcq, wq
+%if ARCH_X86_32
+    mov             r4m, wq
+    mov             r9m, lumaq
+%endif
+%if %2
+    jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    xor       dword r8m, 4
+    jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+    RET
+%endmacro
+
+    %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
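+; FGUV_FN arguments: layout name, horizontal and vertical chroma
+; subsampling; each instantiation emits a generic entry point whose
+; scaling-LUT index blends luma and chroma, plus a .csfl one that
+; indexes the LUT with the (averaged) luma directly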
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0
diff --git a/src/x86/ipred.asm b/src/x86/ipred.asm
new file mode 100644 (file)
index 0000000..ad05b3b
--- /dev/null
@@ -0,0 +1,5386 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+    %rep %0
+        db %1-128, 127-%1
+        %rotate 1
+    %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
+smooth_weights: SMOOTH_WEIGHT_TABLE         \
+      0,   0, 255, 128, 255, 149,  85,  64, \
+    255, 197, 146, 105,  73,  50,  37,  32, \
+    255, 225, 196, 170, 145, 123, 102,  84, \
+     68,  54,  43,  33,  26,  20,  17,  16, \
+    255, 240, 225, 210, 196, 182, 169, 157, \
+    145, 133, 122, 111, 101,  92,  83,  74, \
+     66,  59,  52,  45,  39,  34,  29,  25, \
+     21,  17,  14,  12,  10,   9,   8,   8, \
+    255, 248, 240, 233, 225, 218, 210, 203, \
+    196, 189, 182, 176, 169, 163, 156, 150, \
+    144, 138, 133, 127, 121, 116, 111, 106, \
+    101,  96,  91,  86,  82,  77,  73,  69, \
+     65,  61,  57,  54,  50,  47,  44,  41, \
+     38,  35,  32,  29,  27,  25,  22,  20, \
+     18,  16,  15,  13,  12,  10,   9,   8, \
+      7,   6,   6,   5,   5,   4,   4,   4
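+; e.g. weight 149 is stored as the byte pair {149-128, 127-149} = {21, -22};
+; pmaddubsw of that pair with unsigned pixels {a, b} gives 21*a - 22*b, and
+; adding the precomputed 128*a + 129*b term (see the SMOOTH macro below)
+; restores 149*a + (256-149)*b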
+
+pb_1to32:     db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+              db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+pb_32to1:     db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+pb_16to1:     db 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
+z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
+              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
+              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
+z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
+              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_128:       times 4 db 128 ; these are just placed here for alignment.
+pb_36_m4:     times 2 db 36, -4
+z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
+z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
+z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
+z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
+z2_upsample:  db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
+z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z2_shuf_h2:   db  3,  2,  7,  6, 11, 10, 15, 14,  2,  1,  6,  5, 10,  9, 14, 13
+z2_shuf_h4:   db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
+z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
+z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+z_base_inc:   dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
+              dw  16*64,  17*64,  18*64,  19*64,  20*64,  21*64,  22*64,  23*64
+z2_base_inc:  dw   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64,   8*64
+              dw   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64,  16*64
+z2_ymul:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
+              db 32, 32, 32, 32, 12, 12, 12, 12,  1,  0,  1,  0,  5, -1, -1, -1 ; 0, 4, 1, 5
+; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
+filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
+              db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
+filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
+filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
+pb_127_m127:  times 2 db 127, -127
+ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
+              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
+ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
+              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
+pw_64:        times 2 dw 64
+
+cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
+                             times 9 db 7, -1
+cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
+                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+                        ; w=8, w_pad=1 as well as second half of previous one
+cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
+                        times 5 db 6, 7
+                        ; w=16,w_pad=2
+                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+                        times 8 db 14, 15
+                        ; w=16,w_pad=3
+                        db 0, 1, 2, 3, 4, 5
+                        times 13 db 6, 7
+pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1  (ipred_h_shuf+12)
+%define pb_2  (ipred_h_shuf+20)
+%define pb_3  (ipred_h_shuf+ 4)
+%define pb_4  (ipred_h_shuf+24)
+%define pb_5  (ipred_h_shuf+ 8)
+%define pb_7  (ipred_h_shuf+ 0)
+%define pb_8  (z_upsample2 +12)
+%define pb_12 (z2_y_shuf_h4+20)
+%define pb_14 (z2_y_shuf_h4+ 4)
+%define pb_15 (z_filter_s  +32)
+%define pb_27 (z2_y_shuf_h4+ 8)
+%define pb_31 (z2_y_shuf_h4+12)
+%define pb_32 (z2_y_shuf_h4+16)
+%define pb_90 (z2_y_shuf_h4+ 0)
+%define pw_1  (z2_y_shuf_h4+24)
+%define pw_8  (z_filter_k  +32)
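+; the pb_*/pw_* constants above are aliased into bytes of existing tables
+; to avoid spending extra rodata (and cache) on them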
+
+pw_62:    times 2 dw 62
+pw_128:   times 2 dw 128
+pw_255:   times 2 dw 255
+pw_512:   times 2 dw 512
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
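+; the table symbol is defined 2*4 bytes before the entries, so indexing
+; with tzcnt(w)*4 (tzcnt(4) = 2) reaches the first entry directly; each
+; entry is the label's offset from that symbol and is rebased at run time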
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
+
+JMP_TABLE ipred_smooth,     avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v,   avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h,   avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth,      avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_filter,     avx2, w4, w8, w16, w32
+JMP_TABLE ipred_dc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                                  s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left,    avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h,          avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1,         avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2,         avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3,         avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_cfl,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+                                  s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left,   avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
+JMP_TABLE pal_pred,         avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+    lea                  r5, [ipred_dc_left_avx2_table]
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    shrx                r6d, r6d, wd
+    movd                xm3, r6d
+    movsxd               r6, [r5+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+    mov                  hd, hm ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 r5d, 0x8000
+    shrx                r5d, r5d, r6d
+    movd                xm3, r5d
+    lea                  r5, [ipred_dc_left_avx2_table]
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
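+    ; the .h* labels below fall through, halving the vector each step until
+    ; the whole sum sits in one word; pmulhrsw with xm3 = 0x8000 >> log2(n)
+    ; (n = number of pixels summed) then divides by n with rounding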
+.h64:
+    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    vextracti128        xm1, m0, 1
+    paddw               xm0, xm1
+.h16:
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+.h8:
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+.h4:
+    pmaddwd             xm0, xm2
+    pmulhrsw            xm0, xm3
+    lea            stride3q, [strideq*3]
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0
+    jmp                  wq
+
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn            hd, hm
+    movifnidn            wd, wm
+    tzcnt               r6d, hd
+    lea                 r5d, [wq+hq]
+    movd                xm4, r5d
+    tzcnt               r5d, r5d
+    movd                xm5, r5d
+    lea                  r5, [ipred_dc_avx2_table]
+    tzcnt                wd, wd
+    movsxd               r6, [r5+r6*4]
+    movsxd               wq, [r5+wq*4+5*4]
+    pcmpeqd              m3, m3
+    psrlw               xm4, 1
+    add                  r6, r5
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  r6
+.h4:
+    movd                xm0, [tlq-4]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w4:
+    movd                xm1, [tlq+1]
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw               xm0, 3
+    jmp .w4_end
+.w4_mul:
+    punpckhqdq          xm1, xm0, xm0
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x55563334
+    paddw               xm0, xm1
+    shrx                r6d, r6d, r2d
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    movd                xm1, r6d
+    psrlw               xm0, 2
+    pmulhuw             xm0, xm1
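+    ; 0x5556 ~ 0x10000/3 and 0x3334 ~ 0x10000/5: combined with the psrlw
+    ; by 2 this divides the sum by 12 (h=8, where the shift of 16 selects
+    ; 0x5556) or by 20 (h=16, where the shift count 32 wraps to 0 and
+    ; leaves 0x3334 in the low word)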
+.w4_end:
+    vpbroadcastb        xm0, xm0
+.s4:
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm0
+    movd   [dstq+strideq*2], xm0
+    movd   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s4
+    RET
+ALIGN function_align
+.h8:
+    movq                xm0, [tlq-8]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w8:
+    movq                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    punpckhqdq          xm2, xm0, xm0
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    cmp                  hd, 32
+    cmove               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w8_end:
+    vpbroadcastb        xm0, xm0
+.s8:
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm0
+    movq   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                xm0, [tlq-16]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w16:
+    movu                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hb, 8|32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w16_end:
+    vpbroadcastb        xm0, xm0
+.s16:
+    mova   [dstq+strideq*0], xm0
+    mova   [dstq+strideq*1], xm0
+    mova   [dstq+strideq*2], xm0
+    mova   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 32
+    je .w32_end
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w32_end:
+    vpbroadcastb         m0, xm0
+.s32:
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m0
+    mova   [dstq+strideq*2], m0
+    mova   [dstq+stride3q ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                 m0, [tlq-64]
+    mova                 m1, [tlq-32]
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    jmp                  wq
+.w64:
+    movu                 m1, [tlq+ 1]
+    movu                 m2, [tlq+33]
+    pmaddubsw            m1, m3
+    pmaddubsw            m2, m3
+    paddw                m0, m1
+    paddw                m0, m2
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 64
+    je .w64_end
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, hd
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w64_end:
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0
+.s64:
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m1
+    mova [dstq+strideq*1+32*0], m0
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m0
+    mova [dstq+strideq*2+32*1], m1
+    mova [dstq+stride3q +32*0], m0
+    mova [dstq+stride3q +32*1], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s64
+    RET
+
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+    mova                 m1, m0
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movu                 m0, [tlq+ 1]
+    movu                 m1, [tlq+33]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+%macro IPRED_H 2 ; w, store_type
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mov%2  [dstq+strideq*0], m0
+    mov%2  [dstq+strideq*1], m1
+    mov%2  [dstq+strideq*2], m2
+    mov%2  [dstq+stride3q ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w%1
+    RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_h_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    IPRED_H               4, d
+.w8:
+    IPRED_H               8, q
+.w16:
+    IPRED_H              16, a
+INIT_YMM avx2
+.w32:
+    IPRED_H              32, a
+.w64:
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m0
+    mova [dstq+strideq*1+32*0], m1
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m2
+    mova [dstq+strideq*2+32*1], m2
+    mova [dstq+stride3q +32*0], m3
+    mova [dstq+stride3q +32*1], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w64
+    RET
+
+%macro PAETH 2 ; top, ldiff
+    pavgb                m1, m%1, m3 ; Calculating tldiff normally requires
+    pxor                 m0, m%1, m3 ; 10-bit intermediates, but we can do it
+    pand                 m0, m4      ; in 8-bit with some tricks that avoid
+    psubusb              m2, m5, m1  ; having to unpack everything to 16-bit.
+    psubb                m1, m0
+    psubusb              m1, m5
+    por                  m1, m2
+    paddusb              m1, m1
+    por                  m1, m0      ; min(tldiff, 255)
+    psubusb              m2, m5, m3
+    psubusb              m0, m3, m5
+    por                  m2, m0      ; tdiff
+    pminub               m2, m%2
+    pcmpeqb              m0, m%2, m2 ; ldiff <= tdiff
+    vpblendvb            m0, m%1, m3, m0
+    pminub               m1, m2
+    pcmpeqb              m1, m2      ; ldiff <= tldiff || tdiff <= tldiff
+    vpblendvb            m0, m5, m0, m1
+%endmacro
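+; Paeth picks whichever of top, left and topleft is closest to
+; base = left + top - topleft; since |base-left| = |top-topleft| (ldiff)
+; and |base-top| = |left-topleft| (tdiff), base never has to be formed,
+; and tldiff is built at half precision from pavgb plus the carry bit it
+; drops, then doubled with saturation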
+
+cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
+%define base r5-ipred_paeth_avx2_table
+    lea                  r5, [ipred_paeth_avx2_table]
+    tzcnt                wd, wm
+    vpbroadcastb         m5, [tlq]   ; topleft
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    vpbroadcastd         m4, [base+pb_1]
+    add                  wq, r5
+    jmp                  wq
+.w4:
+    vpbroadcastd         m6, [tlq+1] ; top
+    mova                 m8, [base+ipred_h_shuf]
+    lea                  r3, [strideq*3]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0      ; ldiff
+.w4_loop:
+    sub                 tlq, 8
+    vpbroadcastq         m3, [tlq]
+    pshufb               m3, m8      ; left
+    PAETH                 6, 7
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r3       ], xm1, 2
+    cmp                  hd, 4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm0, 3
+    pextrd [dstq+r3       ], xm1, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 8
+    jg .w4_loop
+.ret:
+    RET
+ALIGN function_align
+.w8:
+    vpbroadcastq         m6, [tlq+1]
+    mova                 m8, [base+ipred_h_shuf]
+    lea                  r3, [strideq*3]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w8_loop:
+    sub                 tlq, 4
+    vpbroadcastd         m3, [tlq]
+    pshufb               m3, m8
+    PAETH                 6, 7
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+r3       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    vbroadcasti128       m6, [tlq+1]
+    mova                xm8, xm4 ; lower half = 1, upper half = 0
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w16_loop:
+    sub                 tlq, 2
+    vpbroadcastd         m3, [tlq]
+    pshufb               m3, m8
+    PAETH                 6, 7
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w32_loop:
+    dec                 tlq
+    vpbroadcastb         m3, [tlq]
+    PAETH                 6, 7
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    movu                 m6, [tlq+ 1]
+    movu                 m7, [tlq+33]
+%if WIN64
+    movaps              r4m, xmm9
+%endif
+    psubusb              m8, m5, m6
+    psubusb              m0, m6, m5
+    psubusb              m9, m5, m7
+    psubusb              m1, m7, m5
+    por                  m8, m0
+    por                  m9, m1
+.w64_loop:
+    dec                 tlq
+    vpbroadcastb         m3, [tlq]
+    PAETH                 6, 8
+    mova        [dstq+32*0], m0
+    PAETH                 7, 9
+    mova        [dstq+32*1], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+%if WIN64
+    movaps             xmm9, r4m
+%endif
+    RET
+
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+    ; w * a         = (w - 128) * a + 128 * a
+    ; (256 - w) * b = (127 - w) * b + 129 * b
+    pmaddubsw            m0, m%3, m%1
+    pmaddubsw            m1, m%4, m%2
+    paddw                m0, m%5
+    paddw                m1, m%6
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+%endmacro
+
+cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_avx2_table
+    lea                  r6, [ipred_smooth_v_avx2_table]
+    tzcnt                wd, wm
+    mov                  hd, hm
+    movsxd               wq, [r6+wq*4]
+    vpbroadcastd         m0, [base+pb_127_m127]
+    vpbroadcastd         m1, [base+pw_128]
+    lea            weightsq, [base+smooth_weights+hq*4]
+    neg                  hq
+    vpbroadcastb         m5, [tlq+hq] ; bottom
+    add                  wq, r6
+    jmp                  wq
+.w4:
+    vpbroadcastd         m2, [tlq+1]
+    punpcklbw            m2, m5 ; top, bottom
+    mova                 m5, [base+ipred_v_shuf]
+    lea                  r3, [strideq*3]
+    punpckldq            m4, m5, m5
+    punpckhdq            m5, m5
+    pmaddubsw            m3, m2, m0
+    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
+    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
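+    ; the punpcklbw'd {top, bottom} pair read as int16 is top + 256*bottom,
+    ; which is where the "1 * top + 256 * bottom" term above comes from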
+.w4_loop:
+    vbroadcasti128       m1, [weightsq+hq*2]
+    pshufb               m0, m1, m4
+    pshufb               m1, m5
+    SMOOTH                0, 1, 2, 2, 3, 3
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm1
+    pextrd [dstq+strideq*2], xm0, 1
+    pextrd [dstq+r3       ], xm1, 1
+    cmp                  hd, -4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm1, 2
+    pextrd [dstq+strideq*2], xm0, 3
+    pextrd [dstq+r3       ], xm1, 3
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 8
+    jl .w4_loop
+.ret:
+    RET
+ALIGN function_align
+.w8:
+    vpbroadcastq         m2, [tlq+1]
+    punpcklbw            m2, m5
+    mova                 m5, [base+ipred_v_shuf]
+    lea                  r3, [strideq*3]
+    pshufd               m4, m5, q0000
+    pshufd               m5, m5, q1111
+    pmaddubsw            m3, m2, m0
+    paddw                m1, m2
+    paddw                m3, m1
+.w8_loop:
+    vpbroadcastq         m1, [weightsq+hq*2]
+    pshufb               m0, m1, m4
+    pshufb               m1, m5
+    SMOOTH                0, 1, 2, 2, 3, 3
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+r3       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 4
+    jl .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    WIN64_SPILL_XMM       7
+    vbroadcasti128       m3, [tlq+1]
+    mova                 m6, [base+ipred_v_shuf]
+    punpcklbw            m2, m3, m5
+    punpckhbw            m3, m5
+    pmaddubsw            m4, m2, m0
+    pmaddubsw            m5, m3, m0
+    paddw                m0, m1, m2
+    paddw                m1, m3
+    paddw                m4, m0
+    paddw                m5, m1
+.w16_loop:
+    vpbroadcastd         m1, [weightsq+hq*2]
+    pshufb               m1, m6
+    SMOOTH                1, 1, 2, 3, 4, 5
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    add                  hq, 2
+    jl .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       6
+    movu                 m3, [tlq+1]
+    punpcklbw            m2, m3, m5
+    punpckhbw            m3, m5
+    pmaddubsw            m4, m2, m0
+    pmaddubsw            m5, m3, m0
+    paddw                m0, m1, m2
+    paddw                m1, m3
+    paddw                m4, m0
+    paddw                m5, m1
+.w32_loop:
+    vpbroadcastw         m1, [weightsq+hq*2]
+    SMOOTH                1, 1, 2, 3, 4, 5
+    mova             [dstq], m0
+    add                dstq, strideq
+    inc                  hq
+    jl .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    WIN64_SPILL_XMM      11
+    movu                 m4, [tlq+ 1]
+    movu                 m8, [tlq+33]
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m7, m8, m5
+    punpckhbw            m8, m5
+    pmaddubsw            m5, m3, m0
+    pmaddubsw            m6, m4, m0
+    pmaddubsw            m9, m7, m0
+    pmaddubsw           m10, m8, m0
+    paddw                m2, m1, m3
+    paddw                m5, m2
+    paddw                m2, m1, m4
+    paddw                m6, m2
+    paddw                m0, m1, m7
+    paddw                m9, m0
+    paddw                m1, m8
+    paddw               m10, m1
+.w64_loop:
+    vpbroadcastw         m2, [weightsq+hq*2]
+    SMOOTH                2, 2, 3, 4, 5, 6
+    mova        [dstq+32*0], m0
+    SMOOTH                2, 2, 7, 8, 9, 10
+    mova        [dstq+32*1], m0
+    add                dstq, strideq
+    inc                  hq
+    jl .w64_loop
+    RET
+
+%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
+    %assign stack_offset 0
+    %assign stack_size_padded 0
+    %assign regs_used %2
+    %xdefine rstk rsp
+    SETUP_STACK_POINTER %1
+    %if regs_used != %2 && WIN64
+        PUSH r%2
+    %endif
+    ALLOC_STACK %1, %3
+%endmacro
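+; resets x86inc's stack bookkeeping before allocating a fresh frame, since
+; several width-specific entry points with different stack requirements
+; share a single cglobal prologue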
+
+cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_avx2_table
+    lea                  r6, [ipred_smooth_h_avx2_table]
+    mov                  wd, wm
+    vpbroadcastb         m3, [tlq+wq] ; right
+    tzcnt                wd, wd
+    mov                  hd, hm
+    movsxd               wq, [r6+wq*4]
+    vpbroadcastd         m4, [base+pb_127_m127]
+    vpbroadcastd         m5, [base+pw_128]
+    add                  wq, r6
+    jmp                  wq
+.w4:
+    WIN64_SPILL_XMM       8
+    vpbroadcastq         m6, [base+smooth_weights+4*2]
+    mova                 m7, [base+ipred_h_shuf]
+    sub                 tlq, 8
+    sub                 tlq, hq
+    lea                  r3, [strideq*3]
+.w4_loop:
+    vpbroadcastq         m2, [tlq+hq]
+    pshufb               m2, m7
+    punpcklbw            m1, m2, m3 ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
+    paddw                m0, m1     ; 128 * left + 129 * right
+    pmaddubsw            m1, m6
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    pmaddubsw            m2, m6
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r3       ], xm1, 2
+    cmp                  hd, 4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm0, 3
+    pextrd [dstq+r3       ], xm1, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 8
+    jg .w4_loop
+.ret:
+    RET
+ALIGN function_align
+.w8:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
+    vbroadcasti128       m6, [base+smooth_weights+8*2]
+    mova                 m7, [base+ipred_h_shuf]
+    sub                 tlq, 4
+    lea                  r3, [strideq*3]
+    sub                 tlq, hq
+.w8_loop:
+    vpbroadcastd         m2, [tlq+hq]
+    pshufb               m2, m7
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4
+    paddw                m0, m1
+    pmaddubsw            m1, m6
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    pmaddubsw            m2, m6
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+r3       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    SETUP_STACK_FRAME  32*4, 7, 8
+    lea                  r3, [rsp+64*2-4]
+    call .prep ; only worthwhile for w16 and above
+    sub                 tlq, 2
+    vpbroadcastd        xm6, [base+pb_1]
+    mova                xm7, [base+ipred_v_shuf+16]
+    vinserti128          m7, [base+ipred_v_shuf+ 0], 1
+    vbroadcasti128       m4, [base+smooth_weights+16*2]
+    vbroadcasti128       m5, [base+smooth_weights+16*3]
+.w16_loop:
+    vpbroadcastd         m1, [tlq+hq]
+    vpbroadcastd         m2, [r3+hq*2]
+    pshufb               m1, m6
+    punpcklbw            m1, m3
+    pshufb               m2, m7
+    SMOOTH                4, 5, 1, 1, 2, 2
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    SETUP_STACK_FRAME  32*4, 7, 6
+    lea                  r3, [rsp+64*2-2]
+    call .prep
+    dec                 tlq
+    mova                xm4, [base+smooth_weights+16*4]
+    vinserti128          m4, [base+smooth_weights+16*6], 1
+    mova                xm5, [base+smooth_weights+16*5]
+    vinserti128          m5, [base+smooth_weights+16*7], 1
+.w32_loop:
+    vpbroadcastb         m1, [tlq+hq]
+    punpcklbw            m1, m3
+    vpbroadcastw         m2, [r3+hq*2]
+    SMOOTH                4, 5, 1, 1, 2, 2
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    SETUP_STACK_FRAME  32*4, 7, 9
+    lea                  r3, [rsp+64*2-2]
+    call .prep
+    add                  r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+    dec                 tlq
+    mova                xm5, [r6-16*7]
+    vinserti128          m5, [r6-16*5], 1
+    mova                xm6, [r6-16*6]
+    vinserti128          m6, [r6-16*4], 1
+    mova                xm7, [r6-16*3]
+    vinserti128          m7, [r6-16*1], 1
+    mova                xm8, [r6-16*2]
+    vinserti128          m8, [r6-16*0], 1
+.w64_loop:
+    vpbroadcastb         m2, [tlq+hq]
+    punpcklbw            m2, m3
+    vpbroadcastw         m4, [r3+hq*2]
+    SMOOTH                5, 6, 2, 2, 4, 4
+    mova        [dstq+32*0], m0
+    SMOOTH                7, 8, 2, 2, 4, 4
+    mova        [dstq+32*1], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+ALIGN function_align
+.prep:
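+    ; precompute the 128*left + 129*right + 128 terms for up to 64
+    ; left-edge pixels into the stack buffer so the width loops only
+    ; have to add their per-column weighted term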
+    vpermq               m2, [tlq-32*1], q3120
+    punpckhbw            m1, m2, m3
+    punpcklbw            m2, m3
+    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
+    paddw                m1, m5     ;   1 * left + 256 * right + 128
+    paddw                m0, m1     ; 128 * left + 129 * right + 128
+    pmaddubsw            m1, m2, m4
+    paddw                m2, m5
+    paddw                m1, m2
+    vpermq               m2, [tlq-32*2], q3120
+    mova [rsp+gprsize+32*3], m0
+    mova [rsp+gprsize+32*2], m1
+    punpckhbw            m1, m2, m3
+    punpcklbw            m2, m3
+    pmaddubsw            m0, m1, m4
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m2, m5
+    paddw                m1, m2
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*0], m1
+    sub                  r3, hq
+    sub                 tlq, hq
+    sub                  r3, hq
+    ret
+
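+; Tail of the 2D smooth predictor: computes the weighted top/bottom sums
+; and pavgw's them with the left/right sums already in m2/m3, i.e.
+; (v + h + 1) >> 1 kept at 16-bit precision, before the final >> 8.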
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+    pmaddubsw            m0, m%3, m%1
+    pmaddubsw            m1, m%4, m%2
+%ifnum %5
+    paddw                m0, m%5
+%else
+    paddw                m0, %5
+%endif
+%ifnum %6
+    paddw                m1, m%6
+%else
+    paddw                m1, %6
+%endif
+    pavgw                m0, m2
+    pavgw                m1, m3
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+%endmacro
+
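+; SMOOTH: pred = (top*wv + bottom*(256-wv) + left*wh + right*(256-wh)
+; + 256) >> 9. The pw_255 bias plus pavgw's +1 in SMOOTH_2D_END supply
+; the +256 rounding term.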
+cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_avx2_table
+    lea                  r6, [ipred_smooth_avx2_table]
+    mov                  wd, wm
+    vpbroadcastb         m4, [tlq+wq] ; right
+    tzcnt                wd, wd
+    mov                  hd, hm
+    mov                  r5, tlq
+    sub                  r5, hq
+    movsxd               wq, [r6+wq*4]
+    vpbroadcastd         m5, [base+pb_127_m127]
+    vpbroadcastb         m0, [r5] ; bottom
+    vpbroadcastd         m3, [base+pw_255]
+    add                  wq, r6
+    lea          v_weightsq, [base+smooth_weights+hq*2]
+    jmp                  wq
+.w4:
+    WIN64_SPILL_XMM      12
+    mova                m10, [base+ipred_h_shuf]
+    vpbroadcastq        m11, [base+smooth_weights+4*2]
+    mova                 m7, [base+ipred_v_shuf]
+    vpbroadcastd         m8, [tlq+1]
+    sub                 tlq, 8
+    lea                  r3, [strideq*3]
+    sub                 tlq, hq
+    punpcklbw            m8, m0 ; top, bottom
+    pshufd               m6, m7, q2200
+    pshufd               m7, m7, q3311
+    pmaddubsw            m9, m8, m5
+    paddw                m3, m8 ;   1 * top + 256 * bottom + 255
+    paddw                m9, m3 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+    vpbroadcastq         m1, [tlq+hq]
+    pshufb               m1, m10
+    punpcklbw            m0, m1, m4 ; left, right
+    punpckhbw            m1, m4
+    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
+    pmaddubsw            m3, m1, m5
+    paddw                m2, m0     ; 128 * left + 129 * right
+    paddw                m3, m1
+    pmaddubsw            m0, m11
+    pmaddubsw            m1, m11
+    paddw                m2, m0
+    paddw                m3, m1
+    vbroadcasti128       m1, [v_weightsq]
+    add          v_weightsq, 16
+    pshufb               m0, m1, m6
+    pshufb               m1, m7
+    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r3       ], xm1, 2
+    cmp                  hd, 4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm0, 3
+    pextrd [dstq+r3       ], xm1, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 8
+    jg .w4_loop
+.ret:
+    RET
+ALIGN function_align
+.w8:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      12
+    mova                m10, [base+ipred_h_shuf]
+    vbroadcasti128      m11, [base+smooth_weights+8*2]
+    mova                 m7, [base+ipred_v_shuf]
+    vpbroadcastq         m8, [tlq+1]
+    sub                 tlq, 4
+    lea                  r3, [strideq*3]
+    sub                 tlq, hq
+    punpcklbw            m8, m0
+    pshufd               m6, m7, q0000
+    pshufd               m7, m7, q1111
+    pmaddubsw            m9, m8, m5
+    paddw                m3, m8
+    paddw                m9, m3
+.w8_loop:
+    vpbroadcastd         m1, [tlq+hq]
+    pshufb               m1, m10
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    pmaddubsw            m2, m0, m5
+    pmaddubsw            m3, m1, m5
+    paddw                m2, m0
+    paddw                m3, m1
+    pmaddubsw            m0, m11
+    pmaddubsw            m1, m11
+    paddw                m2, m0
+    paddw                m3, m1
+    vpbroadcastq         m1, [v_weightsq]
+    add          v_weightsq, 8
+    pshufb               m0, m1, m6
+    pshufb               m1, m7
+    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+r3       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    SETUP_STACK_FRAME  32*4, 7, 14
+    vbroadcasti128      m11, [tlq+1]
+    lea                  r3, [rsp+64*2-4]
+    punpcklbw           m10, m11, m0 ; top, bottom
+    punpckhbw           m11, m0
+    call .prep_v
+    sub                 tlq, 2
+    pmaddubsw           m12, m10, m5
+    pmaddubsw           m13, m11, m5
+    vpbroadcastd        xm5, [base+pb_1]
+    mova                 m9, [base+ipred_v_shuf]
+    vbroadcasti128       m6, [base+smooth_weights+16*2]
+    vbroadcasti128       m7, [base+smooth_weights+16*3]
+    vpermq               m8, m9, q1032
+    paddw                m0, m10, m3
+    paddw                m3, m11
+    paddw               m12, m0
+    paddw               m13, m3
+.w16_loop:
+    vpbroadcastd         m3, [tlq+hq]
+    vpbroadcastd         m0, [r3+hq*2]
+    vpbroadcastd         m1, [v_weightsq]
+    add          v_weightsq, 4
+    pshufb               m3, m5
+    punpcklbw            m3, m4 ; left, right
+    pmaddubsw            m2, m3, m6
+    pmaddubsw            m3, m7
+    pshufb               m0, m8
+    pshufb               m1, m9
+    paddw                m2, m0
+    paddw                m3, m0
+    SMOOTH_2D_END         1, 1, 10, 11, 12, 13
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    SETUP_STACK_FRAME  32*4, 7, 11
+    movu                 m8, [tlq+1]
+    lea                  r3, [rsp+64*2-2]
+    punpcklbw            m7, m8, m0
+    punpckhbw            m8, m0
+    call .prep_v
+    dec                 tlq
+    pmaddubsw            m9, m7, m5
+    pmaddubsw           m10, m8, m5
+    mova                xm5, [base+smooth_weights+16*4]
+    vinserti128          m5, [base+smooth_weights+16*6], 1
+    mova                xm6, [base+smooth_weights+16*5]
+    vinserti128          m6, [base+smooth_weights+16*7], 1
+    paddw                m0, m7, m3
+    paddw                m3, m8
+    paddw                m9, m0
+    paddw               m10, m3
+.w32_loop:
+    vpbroadcastb         m3, [tlq+hq]
+    punpcklbw            m3, m4
+    vpbroadcastw         m0, [r3+hq*2]
+    vpbroadcastw         m1, [v_weightsq]
+    add          v_weightsq, 2
+    pmaddubsw            m2, m3, m5
+    pmaddubsw            m3, m6
+    paddw                m2, m0
+    paddw                m3, m0
+    SMOOTH_2D_END         1, 1, 7, 8, 9, 10
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    SETUP_STACK_FRAME  32*8, 7, 16
+    movu                m13, [tlq+1 ]
+    movu                m15, [tlq+33]
+    add                  r6, smooth_weights+16*15-ipred_smooth_avx2_table
+    lea                  r3, [rsp+64*2-2]
+    punpcklbw           m12, m13, m0
+    punpckhbw           m13, m0
+    punpcklbw           m14, m15, m0
+    punpckhbw           m15, m0
+    call .prep_v
+    dec                 tlq
+    pmaddubsw            m0, m12, m5
+    pmaddubsw            m1, m13, m5
+    pmaddubsw            m2, m14, m5
+    pmaddubsw            m5, m15, m5
+    mova                xm8, [r6-16*7]
+    vinserti128          m8, [r6-16*5], 1
+    mova                xm9, [r6-16*6]
+    vinserti128          m9, [r6-16*4], 1
+    mova               xm10, [r6-16*3]
+    vinserti128         m10, [r6-16*1], 1
+    mova               xm11, [r6-16*2]
+    vinserti128         m11, [r6-16*0], 1
+    lea                  r6, [rsp+32*4]
+    paddw                m0, m3
+    paddw                m1, m3
+    paddw                m2, m3
+    paddw                m3, m5
+    paddw                m0, m12
+    paddw                m1, m13
+    paddw                m2, m14
+    paddw                m3, m15
+    mova          [r6+32*0], m0
+    mova          [r6+32*1], m1
+    mova          [r6+32*2], m2
+    mova          [r6+32*3], m3
+.w64_loop:
+    vpbroadcastb         m5, [tlq+hq]
+    punpcklbw            m5, m4
+    vpbroadcastw         m6, [r3+hq*2]
+    vpbroadcastw         m7, [v_weightsq]
+    add          v_weightsq, 2
+    pmaddubsw            m2, m5, m8
+    pmaddubsw            m3, m5, m9
+    paddw                m2, m6
+    paddw                m3, m6
+    SMOOTH_2D_END         7, 7, 12, 13, [r6+32*0], [r6+32*1]
+    mova        [dstq+32*0], m0
+    pmaddubsw            m2, m5, m10
+    pmaddubsw            m3, m5, m11
+    paddw                m2, m6
+    paddw                m3, m6
+    SMOOTH_2D_END         7, 7, 14, 15, [r6+32*2], [r6+32*3]
+    mova        [dstq+32*1], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+ALIGN function_align
+.prep_v:
+    vpermq               m2, [tlq-32*1], q3120
+    punpckhbw            m1, m2, m4
+    punpcklbw            m2, m4
+    pmaddubsw            m0, m1, m5 ; 127 * left - 127 * right
+    paddw                m0, m1     ; 128 * left + 129 * right
+    pmaddubsw            m1, m2, m5
+    paddw                m1, m2
+    vpermq               m2, [tlq-32*2], q3120
+    mova [rsp+gprsize+32*3], m0
+    mova [rsp+gprsize+32*2], m1
+    punpckhbw            m1, m2, m4
+    punpcklbw            m2, m4
+    pmaddubsw            m0, m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m5
+    paddw                m1, m2
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*0], m1
+    sub                  r3, hq
+    sub                 tlq, hq
+    sub                  r3, hq
+    ret
+
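+; Z1 (angle < 90) predicts from the top edge only. Each row advances a
+; 6.6 fixed-point x position by dx and blends top[base] with top[base+1]
+; using (64-frac, frac) weights; pmulhrsw with pw_512 is the rounding
+; >> 6, and positions past max_base_x replicate top[max_base_x].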
+cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z1_avx2_table]
+    tzcnt                wd, wm
+    movifnidn        angled, anglem
+    movifnidn            hd, hm
+    lea                  r7, [dr_intra_derivative]
+    inc                 tlq
+    movsxd               wq, [r6+wq*4]
+    add                  wq, r6
+    mov                 dxd, angled
+    and                 dxd, 0x7e
+    add              angled, 165 ; ~90
+    movzx               dxd, word [r7+dxq]
+    xor              angled, 0x4ff ; d = 90 - angle
+    vpbroadcastd         m3, [pw_512]
+    vpbroadcastd         m4, [pw_62]
+    vpbroadcastd         m5, [pw_64]
+    jmp                  wq
+.w4:
+    cmp              angleb, 40
+    jae .w4_no_upsample
+    lea                 r3d, [angleq-1024]
+    sar                 r3d, 7
+    add                 r3d, hd
+    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
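+    ; upsampling doubles the top edge with the AV1 (-1, 9, 9, -1)/16
+    ; edge filter (taps scaled by 4 here, hence pb_36_m4); dx is doubled
+    ; below so the walk through the denser edge keeps the same angle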
+    ALLOC_STACK         -32, 8
+    mova                xm1, [tlq-1]
+    pshufb              xm0, xm1, [z_upsample1]
+    pshufb              xm1, [z_upsample2]
+    vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+    add                 dxd, dxd        ; pw_512 (which is already in m3)
+    pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
+    pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
+    pmaddubsw           xm1, xm2
+    movd                xm7, dxd
+    mov                 r3d, dxd ; xpos
+    vpbroadcastw         m7, xm7
+    paddw               xm1, xm0
+    movq                xm0, [tlq]
+    pmulhrsw            xm1, xm3
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    lea                  r2, [strideq*3]
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2 ; xpos2 xpos3 xpos0 xpos1
+    punpcklbw           xm0, xm1
+    psllw                m7, 2
+    mova              [rsp], xm0
+.w4_upsample_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    vpbroadcastq         m1, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vpbroadcastq         m2, [rsp+r5]
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    movq                xm0, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    movhps              xm0, [rsp+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; 64-frac
+    psllw                m2, 8
+    por                  m1, m2     ; 64-frac, frac
+    pmaddubsw            m0, m1
+    paddw                m6, m7     ; xpos += dx
+    pmulhrsw             m0, m3
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*2], xm0
+    pextrd [dstq+r2       ], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_upsample_loop
+    RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+    ; The C version uses a lot of branches, but we can do all the comparisons
+    ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+    lea                  r3, [z_filter_t0]
+    movd                xm0, maxbased
+    movd                xm2, angled
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0
+    vpbroadcastb         m2, xm2
+    pcmpeqb              m1, m0, [base+z_filter_wh]
+    pand                 m1, m2
+    mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    ret
+.w4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 11
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w4_main
+    lea            maxbased, [hq+3]
+    call .filter_strength
+    mov            maxbased, 7
+    test                r5d, r5d
+    jz .w4_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    vpbroadcastd         m7, [base+pb_8]
+    vbroadcasti128       m2, [tlq-1]
+    pminub               m1, m7, [base+z_filter_s]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pminub               m7, [base+z_filter_s+8]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2
+    pmulhrsw             m0, m3
+    mov                 r3d, 9
+    mov                 tlq, rsp
+    cmp                  hd, 4
+    cmovne         maxbased, r3d
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
+.w4_main:
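+    ; m9 below holds max_base_x minus the per-lane base offsets, so a
+    ; single pcmpgtw per iteration yields the "still inside the edge"
+    ; mask used to blend in the top[max_base_x] fill value kept in m7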
+    movd                xm6, dxd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    mov                 r3d, dxd ; xpos
+    movd                xm9, maxbased
+    vpbroadcastw         m9, xm9
+    vbroadcasti128       m8, [z1_shuf_w4]
+    psrlw                m7, 8  ; top[max_base_x]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_x
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
+    paddw               m10, m10
+.w4_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    vpbroadcastq         m1, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    movq                xm0, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; 64-frac
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; 64-frac, frac
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_x
+    pmulhrsw             m0, m3
+    paddw                m6, m10    ; xpos += dx
+    lea                  r5, [dstq+strideq*2]
+    vpblendvb            m0, m7, m0, m1
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [r5  +strideq*0], xm0
+    pextrd [r5  +strideq*1], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    sub                  hd, 4
+    jz .w4_end
+    lea                dstq, [dstq+strideq*4]
+    cmp                 r3d, maxbased
+    jb .w4_loop
+    packuswb            xm7, xm7
+    lea                  r6, [strideq*3]
+.w4_end_loop:
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r6       ], xm7
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_end_loop
+.w4_end:
+    RET
+ALIGN function_align
+.w8:
+    lea                 r3d, [angleq+216]
+    mov                 r3b, hb
+    cmp                 r3d, 8
+    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    movu                xm2, [z_filter_s+6]
+    mova                xm0, [tlq-1]
+    movd                xm6, hd
+    vinserti128          m0, [tlq+7], 1
+    vpbroadcastb        xm6, xm6
+    vbroadcasti128       m1, [z_upsample1]
+    pminub              xm6, xm2
+    vpbroadcastd         m7, [pb_36_m4]
+    vinserti128          m2, xm6, 1
+    add                 dxd, dxd
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    movd                xm6, dxd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6
+    mov                 r3d, dxd
+    psrldq               m0, 1
+    lea                  r2, [strideq*3]
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3
+    pslldq               m2, m7, 8
+    paddw                m7, m7
+    paddw                m6, m2
+    packuswb             m1, m1
+    punpcklbw            m0, m1
+    mova              [rsp], m0
+.w8_upsample_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    movu                xm0, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [rsp+r5], 1
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    vinserti128          m1, [rsp+r5], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    paddw                m6, m7
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*2], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+r2       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_upsample_loop
+    RET
+.w8_no_intra_edge_filter:
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(h+7, 15)
+    jmp .w8_main
+.w8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [hq+7]
+    test             angled, 0x400
+    jnz .w8_no_intra_edge_filter
+    call .filter_strength
+    test                r5d, r5d
+    jz .w8_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    movu                xm2, [tlq]
+    pminub              xm1, xm0, [base+z_filter_s+14]
+    vinserti128          m2, [tlq-1], 1
+    vinserti128          m1, [base+z_filter_s+ 0], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pminub              xm0, [base+z_filter_s+22]
+    vinserti128          m0, [base+z_filter_s+ 8], 1
+    pshufb               m6, m2, m1
+    pmaddubsw            m6, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r3d, byte [tlq+15]
+    shufps               m1, m0, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m1, m6
+    sub                 r5d, 3
+    jnz .w8_3tap
+    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+    ; which also results in an awkward edge case where out[w*2] is
+    ; slightly different from out[max_base_x] when h > w.
+    vpbroadcastd         m7, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq+14]
+    pshufb               m2, m0
+    pmaddubsw            m2, m7
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4]
+    shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+    mov            [rsp+16], r2b
+    paddw                m1, m2
+.w8_3tap:
+    pmulhrsw             m1, m3
+    sar                 r5d, 1
+    mov                 tlq, rsp
+    add                 r5d, 17 ; w*2 + (filter_strength == 3)
+    cmp                  hd, 16
+    cmovns         maxbased, r5d
+    mov            [tlq+r5], r3b
+    vextracti128        xm0, m1, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
+.w8_main:
+    movd                xm2, dxd
+    vbroadcasti128       m0, [z_base_inc]
+    vpbroadcastw         m2, xm2
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8
+    psubw                m9, m0
+    mov                 r3d, dxd
+    paddw                m6, m2, m2
+    vpblendd             m2, m6, 0xf0
+.w8_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6
+    pand                 m0, m4, m2
+    psubw                m1, m5, m0
+    psllw                m0, 8
+    por                  m1, m0
+    movu                xm0, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [tlq+r5], 1
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2
+    paddw                m2, m6
+    pmulhrsw             m0, m3
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    sub                  hd, 2
+    jz .w8_end
+    lea                dstq, [dstq+strideq*2]
+    cmp                 r3d, maxbased
+    jb .w8_loop
+    packuswb            xm7, xm7
+.w8_end_loop:
+    movq   [dstq+strideq*0], xm7
+    movq   [dstq+strideq*1], xm7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_end_loop
+.w8_end:
+    RET
+.w16_no_intra_edge_filter:
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(h+15, 31)
+    jmp .w16_main
+ALIGN function_align
+.w16:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [hq+15]
+    test             angled, 0x400
+    jnz .w16_no_intra_edge_filter
+    call .filter_strength
+    test                r5d, r5d
+    jz .w16_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    vpbroadcastd         m1, [base+pb_12]
+    vbroadcasti128       m6, [base+z_filter_s+8]
+    vinserti128          m2, m6, [base+z_filter_s], 0
+    vinserti128          m6, [base+z_filter_s+16], 1
+    mova               xm10, [tlq-1]
+    vinserti128         m10, [tlq+3], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+14]
+    vinserti128          m8, m7, [base+z_filter_s+6], 0
+    vinserti128          m7, [base+z_filter_s+22], 1
+    psubw                m0, m1
+    movu               xm11, [tlq+12]
+    vinserti128         m11, [tlq+16], 1
+    pminub               m8, m0
+    pminub               m7, m0
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r3d, byte [tlq+31]
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3
+    jnz .w16_3tap
+    vpbroadcastd         m9, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq+30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4]
+    shr                 r2d, 3
+    mov            [rsp+32], r2b
+    paddw                m0, m10
+    paddw                m1, m11
+.w16_3tap:
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    sar                 r5d, 1
+    mov                 tlq, rsp
+    add                 r5d, 33
+    cmp                  hd, 32
+    cmovns         maxbased, r5d
+    mov            [tlq+r5], r3b
+    packuswb             m0, m1
+    vpermq               m0, m0, q3120
+    mova              [tlq], m0
+.w16_main:
+    movd                xm6, dxd
+    vbroadcasti128       m0, [z_base_inc]
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    mov                 r3d, dxd
+    psubw                m9, m0
+    paddw               m11, m6, m6
+    psubw               m10, m9, m3 ; 64*8: threshold for bytes 8-15, whose base is 8 ahead
+    vpblendd             m6, m11, 0xf0
+.w16_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r3+0]
+    movu                xm1, [tlq+r3+8]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [tlq+r5+0], 1
+    vinserti128          m1, [tlq+r5+8], 1
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddw                m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    sub                  hd, 2
+    jz .w16_end
+    lea                dstq, [dstq+strideq*2]
+    cmp                 r3d, maxbased
+    jb .w16_loop
+.w16_end_loop:
+    mova   [dstq+strideq*0], xm7
+    mova   [dstq+strideq*1], xm7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_end_loop
+.w16_end:
+    RET
+ALIGN function_align
+.w32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea                 r3d, [hq+31]
+    mov            maxbased, 63
+    cmp                  hd, 32
+    cmovs          maxbased, r3d
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w32_main
+    vbroadcasti128       m0, [pb_0to15]
+    sub                 r3d, 29 ; h+2
+    movu               xm13, [tlq+29]    ; 32-39
+    movd                xm1, r3d
+    movu               xm14, [tlq+37]    ; 40-47
+    sub                 r3d, 8 ; h-6
+    vinserti128         m14, [tlq+51], 1 ; 56-63
+    vpbroadcastb        xm1, xm1
+    mova               xm11, [tlq- 1]    ;  0- 7
+    vinserti128         m11, [tlq+13], 1 ; 16-23
+    movd                xm2, r3d
+    movu               xm12, [tlq+ 5]    ;  8-15
+    vinserti128         m12, [tlq+19], 1 ; 24-31
+    pminub              xm1, xm0 ; clip 32x8
+    mova                 m7, [z_filter_s+0]
+    pshufb             xm13, xm1
+    vpbroadcastd         m1, [pb_12]
+    vpbroadcastb        xm2, xm2
+    vinserti128         m13, [tlq+43], 1 ; 48-55
+    vinserti128          m8, m7, [z_filter_s+4], 1
+    vpblendd             m2, m1, 0xf0
+    vinserti128          m7, [z_filter_s+12], 0
+    pminub               m2, m0 ; clip 32x16 and 32x(32|64)
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m14, m2
+    pshufb               m0, m11, m8
+    shufps               m8, m7, q1021
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m7
+    pmaddubsw           m12, m9
+    movzx               r3d, byte [tlq+63]
+    movzx               r2d, byte [tlq+62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m7
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4] ; edge case for 32x64
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3
+    mov            [rsp+64], r2b
+    mov                 tlq, rsp
+    mov            [tlq+65], r3b
+    mov                 r3d, 65
+    cmp                  hd, 64
+    cmove          maxbased, r3d
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq+ 0], m0
+    mova           [tlq+32], m1
+.w32_main:
+    movd                xm6, dxd
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    mov                 r5d, dxd
+    psubw                m9, [z_base_inc]
+    mova                m11, m6
+    psubw               m10, m9, m3 ; 64*8
+.w32_loop:
+    mov                 r3d, r5d
+    shr                 r3d, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                 m0, [tlq+r3+0]
+    movu                 m1, [tlq+r3+8]
+    add                 r5d, dxd
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddw                m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova             [dstq], m0
+    dec                  hd
+    jz .w32_end
+    add                dstq, strideq
+    cmp                 r5d, maxbased
+    jb .w32_loop
+    test                 hb, 1
+    jz .w32_end_loop
+    mova             [dstq], m7
+    add                dstq, strideq
+    dec                  hd
+    jz .w32_end
+.w32_end_loop:
+    mova   [dstq+strideq*0], m7
+    mova   [dstq+strideq*1], m7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_end_loop
+.w32_end:
+    RET
+ALIGN function_align
+.w64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [hq+63]
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w64_main
+    mova               xm11, [tlq- 1]    ;  0- 7
+    vinserti128         m11, [tlq+13], 1 ; 16-23
+    movu               xm12, [tlq+ 5]    ;  8-15
+    vinserti128         m12, [tlq+19], 1 ; 24-31
+    mova                 m7, [z_filter_s+0]
+    vinserti128          m8, m7, [z_filter_s+4], 1
+    vinserti128          m7, [z_filter_s+12], 0
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    movu               xm13, [tlq+29]    ; 32-39
+    vinserti128         m13, [tlq+43], 1 ; 48-55
+    movu               xm14, [tlq+37]    ; 40-47
+    vinserti128         m14, [tlq+51], 1 ; 56-63
+    pshufb               m0, m11, m8
+    shufps               m8, m7, q1021
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    shufps              m15, m8, m7, q2121
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd        m10, [z_filter_k+4*2+12*2]
+    pshufb              m11, m15
+    pmaddubsw           m11, m10
+    pshufb              m12, m7
+    pmaddubsw           m12, m10
+    pshufb              m13, m7
+    pmaddubsw           m13, m10
+    pshufb              m14, m7
+    pmaddubsw           m14, m10
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq+ 61]    ;  64- 71
+    vinserti128         m11, [tlq+ 75], 1 ;  80- 87
+    movu               xm12, [tlq+ 69]    ;  72- 79
+    vinserti128         m12, [tlq+ 83], 1 ;  88- 95
+    movu               xm13, [tlq+ 93]    ;  96-103
+    vinserti128         m13, [tlq+107], 1 ; 112-119
+    movu               xm14, [tlq+101]    ; 104-111
+    vinserti128         m14, [tlq+115], 1 ; 120-127
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 r3d, [hq-20]
+    mov                 tlq, rsp
+    packuswb             m0, m2
+    packuswb             m1, m6
+    vpbroadcastd        xm2, [pb_14]
+    vbroadcasti128       m6, [pb_0to15]
+    mova         [tlq+32*0], m0
+    mova         [tlq+32*1], m1
+    movd                xm0, r3d
+    vpbroadcastd         m1, [pb_12]
+    vpbroadcastb         m0, xm0
+    paddb                m0, m2
+    pminub               m0, m6 ; clip 64x16 and 64x32
+    pshufb              m12, m0
+    pminub               m1, m6 ; clip 64x64
+    pshufb              m14, m1
+    pshufb               m0, m11, m7
+    pmaddubsw            m0, m10
+    pshufb               m2, m12, m7
+    pmaddubsw            m2, m10
+    pshufb               m1, m13, m7
+    pmaddubsw            m1, m10
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m10
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m7
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m8
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova         [tlq+32*2], m0
+    mova         [tlq+32*3], m1
+.w64_main:
+    movd               xm12, dxd
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    lea                 r3d, [dxq-64]
+    shl            maxbased, 6
+    vpbroadcastw        m12, xm12
+    sub                 r3d, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    movd                xm6, r3d
+    mov                 r5d, dxd
+    mova                m10, [pb_1to32]
+    vpbroadcastd        m11, [pb_32]
+    vpbroadcastw         m6, xm6
+.w64_loop:
+    mov                 r3d, r5d
+    shr                 r3d, 6
+    movu                 m0, [tlq+r3+ 0]
+    movu                 m1, [tlq+r3+ 8]
+    pand                 m2, m4, m6
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    psraw                m2, m6, 6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packsswb             m2, m2
+    paddb                m2, m10
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m2
+    mova          [dstq+ 0], m0
+    movu                 m0, [tlq+r3+32]
+    movu                 m1, [tlq+r3+40]
+    add                 r5d, dxd
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    paddb                m2, m11
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddw                m6, m12
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m2
+    mova          [dstq+32], m0
+    dec                  hd
+    jz .w64_end
+    add                dstq, strideq
+    cmp                 r5d, maxbased
+    jb .w64_loop
+.w64_end_loop:
+    mova          [dstq+ 0], m7
+    mova          [dstq+32], m7
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_end_loop
+.w64_end:
+    RET
+
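+; Z2 (90 < angle < 180) reads from both edges: x positions step by dx
+; along the top edge and y positions by dy down the left edge, both in
+; 6.6 fixed point; once a pixel's base_x passes the top-left corner it
+; switches over to the interpolated left-edge sample.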
+cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+    lea                  r9, [ipred_z2_avx2_table]
+    tzcnt                wd, wm
+    movifnidn        angled, anglem
+    movifnidn            hd, hm
+    lea                 dxq, [dr_intra_derivative-90]
+    movsxd               wq, [r9+wq*4]
+    movzx               dyd, angleb
+    xor              angled, 0x400
+    mov                  r8, dxq
+    sub                 dxq, dyq
+    add                  wq, r9
+    add                  r9, z_filter_t0-ipred_z2_avx2_table
+    mova                 m2, [tlq-64]
+    mova                 m0, [tlq-32]
+    mova                 m1, [tlq]
+    and                 dyd, ~1
+    and                 dxq, ~1
+    movzx               dyd, word [r8+dyq]  ; angle - 90
+    movzx               dxd, word [dxq+270] ; 180 - angle
+    vpbroadcastd        m13, [base+pw_512]
+    vpbroadcastd        m14, [base+pw_62]
+    vpbroadcastd        m15, [base+pw_64]
+    mova           [rsp+ 0], m2
+    mova           [rsp+32], m0
+    mova           [rsp+64], m1
+    neg                 dxd
+    neg                 dyd
+    jmp                  wq
+.w4:
+    vpbroadcastq         m6, [base+z2_base_inc] ; base_inc << 6
+    vbroadcasti128      m10, [base+z1_shuf_w4]
+    vbroadcasti128      m11, [base+z2_shuf_h4]
+    lea                 r2d, [dxq+(65<<6)] ; xpos
+    movd                xm5, dyd
+    mov                 r8d, (63-4)<<6
+    mov                 dyq, -4
+    pshuflw             xm5, xm5, q0000
+    pmullw              xm5, [base+z2_ymul]
+    test             angled, 0x400
+    jnz .w4_main ; !enable_intra_edge_filter
+    lea                 r3d, [hq+2]
+    add              angled, 1022
+    shl                 r3d, 6
+    test                r3d, angled
+    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+    vpbroadcastd        xm3, [base+pb_4]
+    call .upsample_above
+    sub              angled, 1075 ; angle - 53
+    lea                 r3d, [hq+3]
+    xor              angled, 0x7f ; 180 - angle
+    call .filter_strength
+    jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+    movd                xm8, r3d
+    mov                 r3d, angled
+    movd                xm7, angled
+    vpbroadcastb         m8, xm8
+    shr                 r3d, 8 ; is_sm << 1
+    vpbroadcastb         m7, xm7
+    pcmpeqb              m8, [base+z_filter_wh]
+    mova                xm9, [r9+r3*8]
+    pand                 m0, m8, m7
+    pcmpgtb              m0, m9
+    pmovmskb            r3d, m0
+    ret
+ALIGN function_align
+.upsample_above: ; w4/w8
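+    ; doubles the filtered top edge in place with the same 4-tap
+    ; upsample as z1, and doubles dx and the base increments so the
+    ; x walk stays consistent with the denser edge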
+    pshufb              xm2, xm1, [base+z_upsample1-2]
+    pminub              xm3, [base+z_filter_s+4]
+    vpbroadcastd        xm4, [base+pb_36_m4]
+    vbroadcasti128      m10, [base+pb_0to15]
+    pshufb              xm3, xm1, xm3
+    pmaddubsw           xm2, xm4
+    pmaddubsw           xm3, xm4
+    lea                 r2d, [r2+dxq+(1<<6)]
+    add                 dxd, dxd
+    paddw               xm2, xm3
+    pmulhrsw            xm2, xm13
+    sub                 r8d, 3<<6
+    paddw                m6, m6
+    packuswb            xm2, xm2
+    punpcklbw           xm1, xm2
+    mova   [rsp+gprsize+64], xm1
+    ret
+ALIGN function_align
+.upsample_left: ; h4/h8
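+    ; same idea for the left edge: doubles it in place and doubles dy
+    ; (and the precomputed y positions in xm5) to match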
+    mov                 r3d, hd
+    and                 r3d, 4
+    movd                xm2, [rsp+gprsize+64]
+    movddup             xm0, [rsp+gprsize+56]
+    movd                xm1, r3d
+    palignr             xm2, xm0, 1
+    vpbroadcastb        xm1, xm1
+    pshufb              xm2, [base+z_filter_s+18]
+    vpbroadcastd        xm3, [base+pb_36_m4]
+    pmaxub              xm1, [base+z_upsample1-2]
+    pshufb              xm1, xm0, xm1
+    pmaddubsw           xm2, xm3
+    pmaddubsw           xm1, xm3
+    paddw               xm5, xm5
+    add                 dyq, dyq
+    paddw               xm1, xm2
+    pmulhrsw            xm1, xm13
+    vbroadcasti128      m11, [base+z2_upsample]
+    paddw               xm5, xm15
+    packuswb            xm1, xm1
+    punpcklbw           xm0, xm1
+    mova   [rsp+gprsize+48], xm0
+    ret
+.w4_no_upsample_above:
+    lea                 r3d, [hq+3]
+    sub              angled, 1112 ; angle - 90
+    call .filter_strength
+    test                r3d, r3d
+    jz .w4_no_filter_above
+    popcnt              r3d, r3d
+    vpbroadcastd        xm2, [base+pb_4]
+    pminub              xm2, [base+z_filter_s]
+    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
+    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
+    pshufb              xm3, xm1, xm2 ; 00 01 12 23
+    pshufd              xm2, xm2, q0321
+    pmaddubsw           xm0, xm3, xm0
+    pshufb              xm2, xm1, xm2 ; 12 23 34 44
+    pmaddubsw           xm2, xm4
+    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
+    punpckhqdq          xm3, xm3      ; 34 44 44 44
+    pmaddubsw           xm3, xm4
+    movd                xm4, r6m      ; max_width
+    pminsw              xm4, xm15
+    vpbroadcastb        xm4, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm3
+    pmulhrsw            xm0, xm13
+    psubb               xm4, [base+pb_1to32]
+    psrlq               xm1, 8
+    packuswb            xm0, xm0
+    vpblendvb           xm0, xm1, xm4
+    movd           [rsp+65], xm0
+.w4_no_filter_above:
+    lea                 r3d, [hq+2]
+    add              angled, 973 ; angle + 883
+    shl                 r3d, 6
+    test                r3d, angled
+    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+    vpbroadcastd        xm0, [base+pb_90]
+    psubb               xm0, xm7 ; 180 - angle
+    pand                xm0, xm8 ; reuse from previous filter_strength call
+    pcmpgtb             xm0, xm9
+    pmovmskb            r3d, xm0
+.w4_filter_left:
+    test                r3d, r3d
+    jz .w4_main
+    popcnt              r3d, r3d
+    mov                 r5d, 10
+    cmp                  hd, 16
+    movu                xm2, [rsp+49]
+    vinserti128          m2, [rsp+43], 1
+    cmovs               r5d, hd
+    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
+    movd                xm0, r5d
+    vbroadcasti128       m1, [base+z_filter_s+12]
+    vbroadcasti128       m4, [base+z_filter_s+16]
+    vinserti128          m3, m1, [z_filter_s+8], 1   ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
+    vpblendd             m1, m4, 0x0f                ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
+    vinserti128          m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
+    vpbroadcastb         m0, xm0
+    pmaxub               m0, m3
+    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
+    pshufb               m0, m2, m0
+    pmaddubsw            m0, m3
+    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*1]
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m3
+    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*2]
+    pshufb               m2, m4
+    pmaddubsw            m2, m3
+    movd                xm4, r7m ; max_height
+    pminsw              xm4, xm15
+    vpbroadcastb        xm4, xm4
+    psubb               xm4, [base+pb_16to1]
+    paddw                m1, m0
+    paddw                m1, m2
+    pmulhrsw             m1, m13
+    vextracti128        xm0, m1, 1
+    packuswb            xm0, xm1
+    vpblendvb           xm0, [rsp+48], xm4
+    mova           [rsp+48], xm0
+    jmp .w4_main
+.w4_upsample_left:
+    call .upsample_left
+.w4_main:
+    movd                xm0, dxd
+    mova                m12, [base+z2_y_shuf_h4]
+    lea                  r5, [rsp+56]  ; left-7
+    vpbroadcastw         m0, xm0
+    lea                  r9, [strideq*3]
+    psraw               xm1, xm5, 6
+    pand                xm5, xm14      ; frac_y
+    pxor                xm2, xm2
+    paddw                m7, m0, m0
+    psubw               xm4, xm2, xm1  ; base_y
+    vpblendd             m0, m7, 0xcc
+    mova                xm1, xm7
+    punpcklwd           xm4, xm2
+    paddw                m0, m1        ; xpos2 xpos3 xpos0 xpos1
+    psubw               xm1, xm15, xm5 ; 64-frac_y
+    psllw               xm5, 8
+    paddw                m7, m7
+    paddw                m6, m0
+    por                 xm5, xm1       ; 64-frac_y, frac_y
+    vpbroadcastq         m5, xm5
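+    ; per-row left samples are gathered with vpgatherdq using the base_y
+    ; offsets in xm4; z2_shuf_h4 then pairs neighboring samples so the
+    ; (64-frac_y, frac_y) bytes in m5 can blend them in one pmaddubsw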
+.w4_loop:
+    lea                 r3d, [r2+dxq]
+    shr                 r2d, 6         ; base_x0
+    vpbroadcastq         m1, [rsp+r2]
+    lea                 r2d, [r3+dxq]
+    shr                 r3d, 6         ; base_x1
+    vpbroadcastq         m2, [rsp+r3]
+    lea                 r3d, [r2+dxq]
+    shr                 r2d, 6         ; base_x2
+    movq                xm0, [rsp+r2]
+    lea                 r2d, [r3+dxq]
+    shr                 r3d, 6         ; base_x3
+    movhps              xm0, [rsp+r3]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m14, m6   ; frac_x
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m15, m2   ; 64-frac_x
+    psllw                m2, 8
+    pshufb               m0, m10
+    por                  m1, m2        ; 64-frac_x, frac_x
+    pmaddubsw            m0, m1
+    cmp                 r3d, 64
+    jge .w4_toponly
+    mova                 m1, m7        ; arbitrary negative value (vpgatherdq uses the sign bits as its mask)
+    vpgatherdq           m3, [r5+xm4], m1
+    pshufb               m1, m3, m11
+    vpermd               m1, m12, m1
+    pmaddubsw            m1, m5
+    psraw                m2, m6, 15    ; base_x < topleft
+    vpblendvb            m0, m1, m2
+.w4_toponly:
+    pmulhrsw             m0, m13
+    paddw                m6, m7        ; xpos += dx
+    add                  r5, dyq
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*2], xm0
+    pextrd [dstq+r9       ], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    sub                  hd, 4
+    jz .w4_end
+    lea                dstq, [dstq+strideq*4]
+    cmp                 r2d, r8d
+    jge .w4_loop
+.w4_leftonly_loop:
+    mova                 m1, m7
+    vpgatherdq           m2, [r5+xm4], m1
+    add                  r5, dyq
+    pshufb               m0, m2, m11
+    vpermd               m0, m12, m0
+    pmaddubsw            m0, m5
+    pmulhrsw             m0, m13
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*2], xm0
+    pextrd [dstq+r9       ], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_leftonly_loop
+.w4_end:
+    RET
+.w8:
+    vbroadcasti128       m6, [base+z2_base_inc] ; base_inc << 6
+    movd                xm5, dyd
+    vbroadcasti128      m10, [base+z_filter_s+2]
+    vbroadcasti128      m11, [base+z2_shuf_h4]
+    lea                 r2d, [dxq+(65<<6)] ; xpos
+    vpbroadcastw        xm5, xm5
+    mov                 r8d, (63-8)<<6
+    mov                 dyq, -4
+    pmullw              xm5, [base+z2_ymul]
+    test             angled, 0x400
+    jnz .w8_main
+    lea                 r3d, [angleq+126]
+    mov                 r3b, hb
+    cmp                 r3d, 8
+    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+    vpbroadcastd        xm3, [base+pb_8]
+    movhps         [rsp+80], xm1
+    call .upsample_above
+    sub              angled, 53 ; angle - 53
+    lea                 r3d, [hq+7]
+    xor              angled, 0x7f ; 180 - angle
+    call .filter_strength
+    jmp .w8_filter_left
+.w8_no_upsample_above:
+    lea                 r3d, [hq+7]
+    sub              angled, 90 ; angle - 90
+    call .filter_strength
+    test                r3d, r3d
+    jz .w8_no_filter_above
+    popcnt              r3d, r3d
+    vpbroadcastd        xm3, [base+pb_8]
+    pminub              xm3, [base+z_filter_s+8]
+    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
+    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
+    pshufb              xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
+    pmaddubsw           xm0, xm2, xm0
+    pshufb              xm3, xm1, xm3               ; 34 45 56 67 78 88 88 88
+    shufps              xm2, xm3, q2121             ; 12 23 34 45 56 67 78 88
+    pmaddubsw           xm2, xm4
+    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
+    pmaddubsw           xm3, xm4
+    movd                xm4, r6m ; max_width
+    pminuw              xm4, xm15
+    vpbroadcastb        xm4, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm3
+    pmulhrsw            xm0, xm13
+    psubb               xm4, [base+pb_1to32]
+    psrldq              xm1, 1
+    packuswb            xm0, xm0
+    vpblendvb           xm0, xm1, xm4
+    movq           [rsp+65], xm0
+.w8_no_filter_above:
+    lea                 r3d, [angleq-51]
+    mov                 r3b, hb
+    cmp                 r3d, 8
+    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+    vpbroadcastd         m0, [base+pb_90]
+    psubb                m0, m7
+    pand                 m0, m8
+    pcmpgtb              m0, m9
+    pmovmskb            r3d, m0
+.w8_filter_left:
+    test                r3d, r3d
+    jz .w8_main
+    popcnt              r3d, r3d
+    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
+    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
+    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
+    cmp                  hd, 32
+    jne .w8_filter_left_h16
+    movu                xm2, [rsp+27]
+    vinserti128          m2, [rsp+35], 1
+    vpbroadcastd        xm0, [base+pb_5]
+    vbroadcasti128       m3, [base+z_filter_s+ 8]
+    vbroadcasti128       m1, [base+z_filter_s+12]
+    vbroadcasti128       m4, [base+z_filter_s+16]
+    pmaxub               m3, m0
+    pshufb               m3, m2, m3
+    pmaddubsw            m3, m7
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m8
+    pshufb               m2, m4
+    pmaddubsw            m2, m9
+    paddw                m3, m1
+    paddw                m3, m2
+    pmulhrsw             m3, m13
+    jmp .w8_filter_left_top16
+.w8_filter_left_h16:
+    mov                 r5d, 10
+    cmp                  hd, 16
+    cmovs               r5d, hd
+    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
+    movd                xm0, r5d
+    vpbroadcastb         m0, xm0
+.w8_filter_left_top16:
+    vbroadcasti128       m1, [base+z_filter_s+12]
+    vinserti128          m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
+    vbroadcasti128       m4, [base+z_filter_s+16]
+    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
+    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
+    pmaxub               m0, m2
+    movu                xm2, [rsp+49]
+    vinserti128          m2, [rsp+43], 1
+    pshufb               m0, m2, m0
+    pmaddubsw            m0, m7
+    movd                xm7, r7m ; max_height
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m8
+    pshufb               m2, m4
+    pmaddubsw            m2, m9
+    pminsw              xm7, xm15
+    paddw                m1, m0
+    vpbroadcastb         m7, xm7
+    paddw                m1, m2
+    pmulhrsw             m1, m13
+    psubb                m7, [base+pb_32to1]
+    packuswb             m3, m1
+    vpermq               m3, m3, q1320
+    vpblendvb            m3, [rsp+32], m7
+    mova           [rsp+32], m3
+    jmp .w8_main
+.w8_upsample_left:
+    call .upsample_left
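+; main loop setup: base_y word indices feed the vpgatherdq loads from the
+; left edge, and frac_y is packed as (64-frac_y, frac_y) byte pairs so a
+; single pmaddubsw performs the linear blend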
+.w8_main:
+    movd                xm3, dxd
+    lea                  r5, [rsp+56]  ; left-7
+    pshufd              xm1, xm5, q3120
+    pand                xm5, xm14
+    vpbroadcastw         m3, xm3
+    pxor                xm0, xm0
+    psubw               xm2, xm15, xm5
+    psraw               xm1, 6
+    lea                  r9, [strideq*3]
+    paddw                m7, m3, m3
+    psubw               xm9, xm0, xm1  ; base_y
+    psllw               xm5, 8
+    punpcklwd           xm8, xm9, xm0  ; base_y 0, 1, 4, 5
+    vpblendd             m3, m7, 0xf0  ; xpos0 xpos1
+    por                 xm5, xm2       ; 64-frac_y, frac_y
+    punpckhwd           xm9, xm0       ; base_y 2, 3, 6, 7
+    paddw                m6, m3
+    vinserti128         m12, m5, xm5, 1
+.w8_loop:
+    lea                 r3d, [r2+dxq]
+    shr                 r2d, 6         ; base_x0
+    movu                xm0, [rsp+r2]
+    lea                 r2d, [r3+dxq]
+    shr                 r3d, 6         ; base_x1
+    vinserti128          m0, [rsp+r3], 1
+    lea                 r3d, [r2+dxq]
+    shr                 r2d, 6         ; base_x2
+    movu                xm1, [rsp+r2]
+    lea                 r2d, [r3+dxq]
+    shr                 r3d, 6         ; base_x3
+    vinserti128          m1, [rsp+r3], 1
+    pand                 m2, m14, m6
+    paddsw               m4, m6, m7
+    psubw                m5, m15, m2
+    psllw                m2, 8
+    pshufb               m0, m10
+    por                  m2, m5
+    pmaddubsw            m0, m2
+    pand                 m2, m14, m4
+    psubw                m5, m15, m2
+    psllw                m2, 8
+    pshufb               m1, m10
+    por                  m2, m5
+    pmaddubsw            m1, m2
+    cmp                 r3d, 64
+    jge .w8_toponly
+    mova                 m5, m7
+    vpgatherdq           m3, [r5+xm9], m7
+    mova                 m7, m5
+    vpgatherdq           m2, [r5+xm8], m5
+    pshufb               m3, m11
+    pshufb               m2, m11
+    punpckldq            m5, m2, m3    ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
+    punpckhdq            m2, m3        ; a2 b2 c2 d2 a3 b3 c3 d3   e2 f2 g2 h2 e3 f3 g3 h3
+    vpermq               m5, m5, q3120 ; y0 y1
+    vpermq               m2, m2, q3120 ; y2 y3
+    pmaddubsw            m5, m12
+    pmaddubsw            m2, m12
+    psraw                m6, 15        ; base_x < topleft
+    vpblendvb            m0, m5, m6
+    psraw                m3, m4, 15
+    vpblendvb            m1, m2, m3
+.w8_toponly:
+    pmulhrsw             m0, m13
+    pmulhrsw             m1, m13
+    paddw                m6, m4, m7     ; xpos += dx
+    add                  r5, dyq
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*2], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+r9       ], xm1
+    sub                  hd, 4
+    jz .w8_end
+    lea                dstq, [dstq+strideq*4]
+    cmp                 r2d, r8d
+    jge .w8_loop
+.w8_leftonly_loop:
+    mova                 m0, m7
+    vpgatherdq           m5, [r5+xm9], m7
+    mova                 m7, m0
+    vpgatherdq           m3, [r5+xm8], m0
+    add                  r5, dyq
+    pshufb               m2, m5, m11
+    pshufb               m1, m3, m11
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q3120
+    pmaddubsw            m0, m12
+    pmaddubsw            m1, m12
+    pmulhrsw             m0, m13
+    pmulhrsw             m1, m13
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*2], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+r9       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_leftonly_loop
+.w8_end:
+    RET
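+; 16-pixel-wide z2 blocks; this is also the shared main loop for w32 and
+; w64, which run it once per 16-wide strip (the strip counter lives in the
+; high byte of r8d, h is restored from r8b)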
+.w16:
+    mov                 r8d, hd
+    test             angled, 0x400
+    jnz .w16_main
+    lea                 r3d, [hq+15]
+    sub              angled, 90
+    call .filter_strength
+    test                r3d, r3d
+    jz .w16_no_filter_above
+    popcnt              r3d, r3d
+    vbroadcasti128       m6, [tlq+1]
+    mova                xm2, [base+z_filter_s]
+    vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
+    movu                xm3, [base+z_filter_s+8]
+    vinserti128          m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab   ab bc cd de ef ff ff ff
+    vpblendd             m1, m6, 0xf0
+    vpbroadcastd         m0, [base+z_filter_k-4+r3*4+12*0]
+    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
+    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*2]
+    pshufb               m2, m1, m2
+    pshufb               m1, m3
+    pmaddubsw            m0, m2, m0
+    shufps               m2, m1, q2121                ; 12 23 34 45 56 67 78 89   89 9a ab bc cd de ef ff
+    pmaddubsw            m2, m4
+    pmaddubsw            m1, m5
+    movd                xm4, r6m ; max_width
+    pminsw              xm4, xm15
+    vpbroadcastb        xm4, xm4
+    paddw                m0, m2
+    paddw                m0, m1
+    pmulhrsw             m0, m13
+    psubb               xm4, [base+pb_1to32]
+    vextracti128        xm2, m0, 1
+    packuswb            xm0, xm2
+    vpblendvb           xm0, xm6, xm4
+    movu           [rsp+65], xm0
+.w16_no_filter_above:
+    vpbroadcastd         m0, [base+pb_90]
+    psubb                m0, m7
+    pand                 m0, m8
+    pcmpgtb              m0, m9
+    pmovmskb            r3d, m0
+    test                r3d, r3d
+    jz .w16_main
+    popcnt              r3d, r3d
+    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
+    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
+    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
+.w16_filter_left:
+    movd                xm6, r7m ; max_height
+    pminsw              xm6, xm15
+    vpbroadcastb         m6, xm6
+    cmp                  hd, 32
+    jl .w16_filter_left_h16
+    vpbroadcastd        xm0, [base+pb_5]
+    vbroadcasti128      m10, [base+z_filter_s+ 8]
+    vbroadcasti128      m11, [base+z_filter_s+12]
+    vbroadcasti128      m12, [base+z_filter_s+16]
+    je .w16_filter_left_h32
+    movu                 m3, [tlq-69]
+    movu                 m5, [tlq-61]
+    pmaxub               m1, m10, m0
+    pshufb               m1, m3, m1
+    pmaddubsw            m1, m7
+    pshufb               m2, m3, m11
+    pmaddubsw            m2, m8
+    pshufb               m3, m12
+    pmaddubsw            m3, m9
+    paddw                m1, m2
+    pshufb               m2, m5, m10
+    pmaddubsw            m2, m7
+    pshufb               m4, m5, m11
+    pmaddubsw            m4, m8
+    pshufb               m5, m12
+    pmaddubsw            m5, m9
+    paddw                m1, m3
+    vpbroadcastd         m3, [base+pb_32]
+    paddb                m3, [base+pb_32to1]
+    paddw                m2, m4
+    paddw                m2, m5
+    pmulhrsw             m1, m13
+    pmulhrsw             m2, m13
+    psubb                m3, m6, m3
+    packuswb             m1, m2
+    vpblendvb            m1, [tlq-64], m3
+    mova              [rsp], m1
+    jmp .w16_filter_left_top32
+.w16_filter_left_h32:
+    pmaxub              m10, m0
+.w16_filter_left_top32:
+    movu                xm2, [tlq-37]
+    vinserti128          m2, [tlq-29], 1
+    pshufb               m3, m2, m10
+    pshufb               m1, m2, m11
+    pshufb               m2, m12
+    pmaddubsw            m3, m7
+    pmaddubsw            m1, m8
+    pmaddubsw            m2, m9
+    paddw                m3, m1
+    paddw                m3, m2
+    pmulhrsw             m3, m13
+    jmp .w16_filter_left_top16
+.w16_filter_left_h16:
+    mov                 r5d, 10
+    cmp                  hd, 16
+    cmovs               r5d, hd
+    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
+    movd                xm0, r5d
+    vpbroadcastb         m0, xm0
+.w16_filter_left_top16:
+    movu                xm2, [tlq-15]
+    vinserti128          m2, [tlq-21], 1
+    vbroadcasti128       m1, [base+z_filter_s+12]
+    vbroadcasti128       m4, [base+z_filter_s+16]
+    vinserti128          m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   34 45 56 67 78 89 9a ab
+    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
+    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
+    pmaxub               m0, m5
+    pshufb               m0, m2, m0
+    pmaddubsw            m0, m7
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m8
+    pshufb               m2, m4
+    pmaddubsw            m2, m9
+    psubb                m6, [base+pb_32to1]
+    paddw                m1, m0
+    paddw                m1, m2
+    pmulhrsw             m1, m13
+    packuswb             m3, m1
+    vpermq               m3, m3, q1320
+    vpblendvb            m3, [tlq-32], m6
+    mova           [rsp+32], m3
+.w16_main:
+    movd                xm1, dyd
+    vbroadcasti128      m10, [base+z_filter_s+2]
+    movd                xm7, dxd
+    vbroadcasti128      m11, [base+z2_shuf_h2]
+    vpbroadcastw         m1, xm1
+    vpbroadcastw         m7, xm7
+    mov                  r7, dstq
+    pmullw               m0, m1, [base+z2_ymul]
+    psllw               xm1, 4
+    paddw                m6, m7, [base+z2_base_inc]
+    lea                 r9d, [dxq+(65<<6)] ; xpos
+    movd          [rsp+156], xm1
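+; per-strip state: [rsp+156] = 16*dy, [rsp+160] = y positions,
+; [rsp+192] = x positions; .w16_end reloads these and advances them by one
+; strip before looping back here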
+.w16_loop0:
+    mov                 r2d, r9d
+    mova          [rsp+160], m0
+    lea                  r5, [rsp+60] ; left-3
+    mova          [rsp+192], m6
+    pxor                 m1, m1
+    psraw                m2, m0, 6
+    pand                 m0, m14
+    psubw                m9, m1, m2   ; base_y
+    psubw               m12, m15, m0
+    punpcklwd            m8, m9, m1   ; base_y  0,  1,  2,  3,     8,  9, 10, 11
+    psllw                m0, 8
+    punpckhwd            m9, m1       ; base_y  4,  5,  6,  7,    12, 13, 14, 15
+    por                 m12, m0       ; 64-frac_y, frac_y
+.w16_loop:
+    lea                 r3d, [r2+dxq]
+    shr                 r2d, 6        ; base_x0
+    movu                xm0, [rsp+r2]
+    vinserti128          m0, [rsp+r2+8], 1
+    lea                 r2d, [r3+dxq]
+    shr                 r3d, 6        ; base_x1
+    movu                xm1, [rsp+r3]
+    vinserti128          m1, [rsp+r3+8], 1
+    pand                 m2, m14, m6
+    paddsw               m5, m6, m7
+    psubw                m3, m15, m2
+    psllw                m2, 8
+    pshufb               m0, m10
+    por                  m2, m3
+    pmaddubsw            m0, m2
+    pand                 m2, m14, m5
+    psubw                m3, m15, m2
+    psllw                m2, 8
+    pshufb               m1, m10
+    por                  m2, m3
+    pmaddubsw            m1, m2
+    cmp                 r3d, 64
+    jge .w16_toponly
+    punpckhwd            m2, m5, m5   ; mask out unnecessary loads
+    vpgatherdd           m4, [r5+m9], m2
+    punpcklwd            m2, m5, m5
+    vpgatherdd           m3, [r5+m8], m2
+    pshufb               m4, m11      ; e0 f0 g0 h0 e1 f1 g1 h1   m0 n0 o0 p0 m1 n1 o1 p1
+    pshufb               m3, m11      ; a0 b0 c0 d0 a1 b1 c1 d1   i0 j0 k0 l0 i1 j1 k1 l1
+    punpcklqdq           m2, m3, m4   ; y0
+    punpckhqdq           m3, m4       ; y1
+    pmaddubsw            m2, m12
+    pmaddubsw            m3, m12
+    psraw                m6, 15       ; base_x < topleft
+    vpblendvb            m0, m2, m6
+    psraw                m6, m5, 15
+    vpblendvb            m1, m3, m6
+.w16_toponly:
+    pmulhrsw             m0, m13
+    pmulhrsw             m1, m13
+    paddw                m6, m5, m7   ; xpos += dx
+    sub                  r5, 2
+    packuswb             m0, m1
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    sub                  hd, 2
+    jz .w16_end
+    lea                dstq, [dstq+strideq*2]
+    cmp                 r2d, (63-16)<<6
+    jge .w16_loop
+.w16_leftonly_loop:
+    mova                 m0, m7
+    vpgatherdd           m4, [r5+m9], m7
+    mova                 m7, m0
+    vpgatherdd           m3, [r5+m8], m0
+    sub                  r5, 2
+    pshufb               m2, m4, m11
+    pshufb               m1, m3, m11
+    punpcklqdq           m0, m1, m2
+    punpckhqdq           m1, m2
+    pmaddubsw            m0, m12
+    pmaddubsw            m1, m12
+    pmulhrsw             m0, m13
+    pmulhrsw             m1, m13
+    packuswb             m0, m1
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_leftonly_loop
+.w16_end:
+    sub                 r8d, 1<<8
+    jl .w16_ret
+    vpbroadcastd         m0, [rsp+156]
+    paddw                m0, [rsp+160] ; base_y += 16*dy
+    paddw                m6, m13, [rsp+192]
+    add                  r7, 16
+    add                 r9d, 16<<6
+    movzx                hd, r8b
+    mov                dstq, r7
+    paddw                m6, m13 ; base_x += 16*64
+    jmp .w16_loop0
+.w16_ret:
+    RET
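+; w32: copy top[32..63] into the stack edge buffer and filter the top edge
+; with the fixed kernel at z_filter_k+4*2 (the max-strength slot; no
+; strength selection is done at this size), then jump to the shared
+; left-edge filter and the .w16 main loop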
+.w32:
+    mova                 m2, [tlq+32]
+    lea                 r8d, [hq+(1<<8)]
+    mova           [rsp+96], m2
+    test             angled, 0x400
+    jnz .w16_main
+    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
+    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
+    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
+    mova                xm5, [base+z_filter_s]
+    vinserti128          m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67   45 56 67 78 89 9a ab bc
+    vinserti128          m1, [tlq+11], 1
+    movu                xm6, [base+z_filter_s+12]
+    vinserti128          m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd   ab bc cd de ef ff ff ff
+    movu                xm3, [tlq+ 6]
+    vinserti128          m3, [tlq+17], 1
+    movd                xm0, r6m ; max_width
+    pminsw              xm0, xm15
+    vpbroadcastb        m10, xm0
+.w32_filter_above:
+    pshufb               m0, m1, m5
+    shufps               m4, m5, m6, q1021           ; 12 23 34 45 56 67 78 89   67 78 89 9a ab bc cd de
+    pmaddubsw            m0, m7
+    pshufb               m2, m1, m4
+    shufps               m5, m6, q2132               ; 34 45 56 67 78 89 9a ab   89 9a ab bc cd de ef ff
+    pmaddubsw            m2, m8
+    pshufb               m1, m5
+    pmaddubsw            m1, m9
+    paddw                m0, m2
+    paddw                m0, m1
+    pshufb               m1, m3, m4
+    pmaddubsw            m1, m7
+    pshufb               m2, m3, m5
+    pmaddubsw            m2, m8
+    pshufb               m3, m6
+    pmaddubsw            m3, m9
+    paddw                m1, m2
+    paddw                m1, m3
+    pmulhrsw             m0, m13
+    pmulhrsw             m1, m13
+    psubb               m10, [base+pb_1to32]
+    packuswb             m0, m1
+    vpblendvb            m0, [tlq+1], m10
+    movu           [rsp+65], m0
+    jmp .w16_filter_left
+.w64:
+    mova                 m2, [tlq+32]
+    mov                 r3d, [tlq+64]
+    lea                 r8d, [hq+(3<<8)]
+    mova          [rsp+ 96], m2
+    mov           [rsp+128], r3d
+    test             angled, 0x400
+    jnz .w16_main
+    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
+    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
+    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
+    movu                xm6, [base+z_filter_s+ 4]
+    vinserti128          m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89   45 56 67 78 89 9a ab bc
+    movu                xm3, [tlq+30]
+    vinserti128          m3, [tlq+43], 1
+    movu                xm5, [base+z_filter_s+16]
+    vinserti128          m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef   ab bc cd de ef ff ff ff
+    pshufb               m0, m3, m6
+    shufps               m4, m6, m5, q1021           ; 34 45 56 67 78 89 9a ab   67 78 89 9a ab bc cd de
+    pmaddubsw            m0, m7
+    pshufb               m2, m3, m4
+    shufps               m6, m5, q2132               ; 56 67 78 89 9a ab bc cd   89 9a ab bc cd de ef ff
+    pmaddubsw            m2, m8
+    pshufb               m3, m6
+    pmaddubsw            m3, m9
+    paddw                m0, m2
+    paddw                m0, m3
+    movu                xm2, [tlq+36]
+    vinserti128          m2, [tlq+49], 1
+    pshufb               m4, m2, m4
+    pmaddubsw            m4, m7
+    pshufb               m3, m2, m6
+    pmaddubsw            m3, m8
+    pshufb               m2, m5
+    pmaddubsw            m2, m9
+    movd                xm5, r6m ; max_width
+    pminsw              xm5, xm15
+    vpbroadcastb        m10, xm5
+    paddw                m3, m4
+    paddw                m2, m3
+    vpbroadcastd         m3, [base+pb_32]
+    pmulhrsw             m0, m13
+    pmulhrsw             m2, m13
+    mova                xm5, [base+z_filter_s]
+    vinserti128          m5, [base+z_filter_s+6], 1
+    psubb                m3, m10, m3
+    psubb                m3, [base+pb_1to32]
+    vinserti128          m1, [tlq+13], 1
+    packuswb             m0, m2
+    vpblendvb            m0, [tlq+33], m3
+    movu                xm3, [tlq+ 6]
+    vinserti128          m3, [tlq+19], 1
+    movu           [rsp+97], m0
+    jmp .w32_filter_above
+
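+; z3: directional prediction from the left edge only (angles 180-270).
+; dy is looked up in the dr_intra_derivative table; each block is predicted
+; column-wise, buffered on the stack, and transposed into dst at the end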
+cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z3_avx2_table]
+    tzcnt                hd, hm
+    movifnidn        angled, anglem
+    lea                  r7, [dr_intra_derivative+45*2-1]
+    dec                 tlq
+    movsxd               hq, [r6+hq*4]
+    sub              angled, 180
+    add                  hq, r6
+    mov                 dyd, angled
+    neg                 dyd
+    xor              angled, 0x400
+    or                  dyq, ~0x7e
+    movzx               dyd, word [r7+dyq]
+    vpbroadcastd         m3, [pw_512]
+    vpbroadcastd         m4, [pw_62]
+    vpbroadcastd         m5, [pw_64]
+    mov              org_wd, wd
+    jmp                  hq
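+; 4-tall strips: when the angle and block size allow, the left edge is
+; upsampled 2x (pb_36_m4 kernel, i.e. the intra edge upsample filter) and
+; dy is doubled to compensate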
+.h4:
+    lea                  r7, [strideq*3]
+    cmp              angleb, 40
+    jae .h4_no_upsample
+    lea                 r4d, [angleq-1024]
+    sar                 r4d, 7
+    add                 r4d, wd
+    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+    ALLOC_STACK         -32, 9
+    movu                xm8, [tlq-7]
+    pshufb              xm0, xm8, [z_upsample1-4]
+    vpbroadcastb        xm2, xm8
+    pshufb              xm1, xm8, [z_filter_s+2]
+    mova           [rsp+16], xm2 ; top[max_base_y]
+    vpbroadcastd        xm2, [pb_36_m4]
+    add                 dyd, dyd
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    movd                xm7, dyd
+    mov                 r2d, dyd
+    vpbroadcastw         m7, xm7
+    paddw               xm1, xm0
+    pmulhrsw            xm1, xm3
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2
+    punpcklbw           xm1, xm8
+    mova                xm8, [z_transpose4]
+    psllw                m7, 2
+    pshufb              xm1, [pb_15to0]
+    mova              [rsp], xm1
+.h4_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6
+    vpbroadcastq         m1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    vpbroadcastq         m2, [rsp+r4]
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6
+    movq                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    movhps              xm0, [rsp+r4]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2
+    psllw                m2, 8
+    por                  m1, m2
+    pmaddubsw            m0, m1
+    paddw                m6, m7
+    pmulhrsw             m0, m3
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm8
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_upsample_loop
+    RET
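+; shared helper for h4/h8/h16: matches block size and angle against the
+; z_filter_wh/z_filter_t0 tables; callers popcount the returned mask in
+; r5d to obtain the edge filter strength (0 = no filtering)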
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+    lea                  r4, [z_filter_t0]
+    movd                xm0, maxbased
+    movd                xm2, angled
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0
+    vpbroadcastb         m2, xm2
+    pcmpeqb              m1, m0, [base+z_filter_wh]
+    pand                 m1, m2
+    mova                xm2, [r4+angleq*8]
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    ret
+.h4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 12
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h4_main
+    lea            maxbased, [wq+3]
+    call .filter_strength
+    mov            maxbased, 7
+    test                r5d, r5d
+    jz .h4_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    vpbroadcastd         m7, [base+pb_7]
+    vbroadcasti128       m2, [tlq-14]
+    pmaxub               m1, m7, [base+z_filter_s-4]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pmaxub               m7, [base+z_filter_s+4]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2
+    pmulhrsw             m0, m3
+    mov                 r4d, 9
+    lea                 tlq, [rsp+15]
+    cmp                  wd, 4
+    cmovne         maxbased, r4d
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [rsp], xm0
+.h4_main:
+    movd                xm6, dyd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    mov                  r4, tlq
+    sub                 tlq, 4
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63] ; ypos
+    movd                xm9, maxbased
+    not            maxbased
+    vbroadcasti128       m8, [z3_shuf_w4]
+    add            maxbased, 64
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8  ; top[max_base_y]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_y
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
+    paddw               m10, m10
+    mova               xm11, [z_transpose4]
+.h4_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base0
+    vpbroadcastq         m1, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base2
+    movq                xm0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; 64-frac
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; 64-frac, frac
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_y
+    pmulhrsw             m0, m3
+    paddw                m6, m10    ; ypos += dy
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm11   ; transpose
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    sub                  wd, 4
+    jz .h4_end
+    add                dstq, 4
+    cmp                 r4d, maxbased
+    jg .h4_loop
+    packuswb            xm7, xm7
+.h4_end_loop:
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r7       ], xm7
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_end_loop
+.h4_end:
+    RET
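+; 8-tall strips: two columns are computed per iteration and interleaved
+; into 16-byte rows on a downward-growing stack buffer, then transposed
+; to dst in the .h8_transpose tail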
+ALIGN function_align
+.h8:
+    lea                 r4d, [angleq+216]
+    mov                 r4b, wb
+    cmp                 r4d, 8
+    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    and                 r4d, 4
+    mova                xm0, [tlq-15]
+    vinserti128          m0, [tlq- 9], 1
+    movd                xm1, r4d
+    movu                xm2, [z_filter_s+2]
+    vinserti128          m2, [z_filter_s+6], 1
+    vpbroadcastb        xm1, xm1 ; w & 4
+    vpbroadcastd         m7, [pb_36_m4]
+    pmaxub              xm1, [z_upsample1-4] ; clip 4x8
+    vinserti128          m1, [z_upsample1], 1
+    add                 dyd, dyd
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    vinserti128          m0, [tlq-7], 1
+    movd                xm6, dyd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6
+    mov                 r2d, dyd
+    lea                  r5, [strideq*3]
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3
+    pslldq               m2, m7, 8
+    paddw                m7, m7
+    paddw                m6, m2
+    vbroadcasti128       m2, [pb_15to0]
+    packuswb             m1, m1
+    punpcklbw            m1, m0
+    pshufb               m1, m2
+    vextracti128   [rsp+ 0], m1, 1
+    mova           [rsp+16], xm1
+.h8_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base0
+    movu                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base1
+    vinserti128          m0, [rsp+r4], 1
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base2
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base3
+    vinserti128          m1, [rsp+r4], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    paddw                m6, m7
+    pmulhrsw             m1, m3
+    lea                  r4, [dstq+strideq*4]
+    psllw                m1, 8
+    por                  m0, m1
+    vextracti128        xm1, m0, 1
+    punpcklbw           xm2, xm0, xm1
+    punpckhbw           xm0, xm1
+    movd   [dstq+strideq*0], xm2
+    pextrd [dstq+strideq*1], xm2, 1
+    pextrd [dstq+strideq*2], xm2, 2
+    pextrd [dstq+r5       ], xm2, 3
+    movd   [r4  +strideq*0], xm0
+    pextrd [r4  +strideq*1], xm0, 1
+    pextrd [r4  +strideq*2], xm0, 2
+    pextrd [r4  +r5       ], xm0, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h8_upsample_loop
+    RET
+.h8_no_intra_edge_filter:
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(w+7, 15)
+    jmp .h8_main
+.h8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [wq+7]
+    test             angled, 0x400
+    jnz .h8_no_intra_edge_filter
+    call .filter_strength
+    test                r5d, r5d
+    jz .h8_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    vpbroadcastd        xm6, [base+pb_15]
+    pcmpeqb             xm1, xm1
+    psubusb             xm6, xm0
+    psubb               xm6, xm1 ; w == 4 ? 5 : 1
+    movu                xm2, [tlq-16]
+    pmaxub              xm1, xm6, [base+z_filter_s]
+    vinserti128          m2, [tlq-14], 1
+    vinserti128          m1, [base+z_filter_s+12], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pmaxub              xm6, [base+z_filter_s+ 8]
+    vinserti128          m6, [base+z_filter_s+20], 1
+    pshufb               m0, m2, m1
+    pmaddubsw            m0, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r4d, byte [tlq-15]
+    shufps               m1, m6, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m0, m1
+    sub                 r5d, 3
+    jnz .h8_3tap
+    vpbroadcastd         m7, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-14]
+    pshufb               m2, m6
+    pmaddubsw            m2, m7
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3
+    mov            [rsp+15], r2b
+    paddw                m0, m2
+.h8_3tap:
+    pmulhrsw             m0, m3
+    sar                 r5d, 1
+    lea                 tlq, [rsp+31]
+    add                 r5d, 17
+    cmp                  wd, 16
+    cmovns         maxbased, r5d
+    neg                  r5
+    mov            [tlq+r5], r4b
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova           [tlq-15], xm0
+.h8_main:
+    movd                xm2, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m2, xm2
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    not            maxbased
+    vbroadcasti128       m8, [z3_shuf]
+    add            maxbased, 64
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8
+    psubw                m9, m0
+    paddw                m6, m2, m2
+    vpblendd             m2, m6, 0x0f
+.h8_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6
+    pand                 m0, m4, m2
+    psubw                m1, m5, m0
+    psllw                m0, 8
+    por                  m1, m0
+    vbroadcasti128       m0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6
+    vinserti128          m0, [tlq+r5], 0
+    sub                 rsp, 8*2
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2
+    paddw                m2, m6
+    pmulhrsw             m0, m3
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    psllw               xm0, 8
+    por                 xm0, xm1 ; interleave rows (partial transpose)
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jz .h8_transpose
+    cmp                 r4d, maxbased
+    jg .h8_loop
+    packuswb            xm0, xm7, xm7
+.h8_end_loop:
+    sub                 rsp, 8*2
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jg .h8_end_loop
+.h8_transpose:
+    mova                xm2, [rsp+16*1]
+    sub              org_wd, 8
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovns             dstq, r6
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+    lea                  r6, [dstq+strideq*4]
+    jge .h8_w8
+    add                 rsp, 16*2
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r2       ], xm1, 3
+    movd   [r6  +strideq*0], xm2
+    pextrd [r6  +strideq*1], xm2, 1
+    pextrd [r6  +strideq*2], xm2, 2
+    pextrd [r6  +r2       ], xm2, 3
+    jmp .h8_end
+.h8_w8_loop:
+    mova                xm0, [rsp+16*0]
+    mova                xm2, [rsp+16*1]
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+.h8_w8: ; w8/w16/w32
+    mova                xm0, [rsp+16*2]
+    mova                xm4, [rsp+16*3]
+    add                 rsp, 16*4
+    punpcklwd           xm3, xm4, xm0
+    punpckhwd           xm4, xm0
+    punpckldq           xm0, xm3, xm1
+    punpckhdq           xm3, xm1
+    punpckldq           xm1, xm4, xm2
+    punpckhdq           xm4, xm2
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm3
+    movhps [dstq+r2       ], xm3
+    movq   [r6  +strideq*0], xm1
+    movhps [r6  +strideq*1], xm1
+    movq   [r6  +strideq*2], xm4
+    movhps [r6  +r2       ], xm4
+    sub                dstq, 8
+    sub                  r6, 8
+    sub              org_wd, 8
+    jge .h8_w8_loop
+.h8_end:
+    RET
+.h16_no_intra_edge_filter:
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(w+15, 31)
+    jmp .h16_main
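+; 16-tall strips: same column-wise scheme with 32-byte stack rows; the
+; edge filter covers up to 31 left pixels, with byte fix-ups for the
+; outermost samples around .h16_3tap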
+ALIGN function_align
+.h16:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [wq+15]
+    test             angled, 0x400
+    jnz .h16_no_intra_edge_filter
+    call .filter_strength
+    test                r5d, r5d
+    jz .h16_main ; filter_strength == 0
+    popcnt              r5d, r5d
+    vpbroadcastd        m11, [base+pb_27]
+    vpbroadcastd         m1, [base+pb_1]
+    vbroadcasti128       m6, [base+z_filter_s+12]
+    vinserti128          m2, m6, [base+z_filter_s+4], 0
+    vinserti128          m6, [base+z_filter_s+20], 1
+    movu               xm10, [tlq-18]
+    vinserti128         m10, [tlq-14], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+8]
+    vinserti128          m8, m7, [base+z_filter_s+0], 0
+    vinserti128          m7, [base+z_filter_s+16], 1
+    psubusb             m11, m0
+    por                  m1, m11
+    movu               xm11, [tlq-32]
+    vinserti128         m11, [tlq-28], 1
+    pmaxub               m8, m1
+    pmaxub               m7, m1
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r4d, byte [tlq-31]
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3
+    jnz .h16_3tap
+    vpbroadcastd         m9, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3
+    mov            [rsp+31], r2b
+    paddw                m0, m10
+    paddw                m1, m11
+.h16_3tap:
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    sar                 r5d, 1
+    lea                 tlq, [rsp+63]
+    add                 r5d, 33
+    cmp                  wd, 32
+    cmovns         maxbased, r5d
+    neg                  r5
+    mov            [tlq+r5], r4b
+    packuswb             m0, m1
+    vpermq               m0, m0, q2031
+    mova           [tlq-31], m0
+.h16_main:
+    movd                xm6, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    not            maxbased
+    vbroadcasti128       m8, [z3_shuf]
+    add            maxbased, 64
+    vpbroadcastw         m9, xm9
+    psubw                m9, m0
+    paddw               m11, m6, m6
+    psubw               m10, m9, m3 ; 64*8
+    vpblendd             m6, m11, 0xf0
+.h16_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r4-0]
+    movu                xm1, [tlq+r4-8]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6
+    vinserti128          m0, [tlq+r5-0], 1
+    vinserti128          m1, [tlq+r5-8], 1
+    sub                 rsp, 32
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddw                m6, m11
+    vpblendvb            m0, m7, m0, m1
+    vpermq               m0, m0, q3120
+    mova              [rsp], m0
+    sub                  wd, 2
+    jz .h16_transpose
+    cmp                 r4d, maxbased
+    jg .h16_loop
+    mova                 m0, m7
+.h16_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7
+    sub                  wd, 2
+    jg .h16_end_loop
+.h16_transpose:
+    mova                 m2, [rsp+32*1]
+    sub              org_wd, 8
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovns             dstq, r6
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    lea                  r3, [strideq*5]
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    lea                  r4, [strideq+r2*2] ; stride*7
+    jge .h16_w8
+    add                 rsp, 32*2
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    vextracti128        xm0, m0, 1
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    lea                dstq, [dstq+strideq*8]
+    vextracti128        xm1, m1, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    jmp .h16_end
+.h16_w8_loop:
+    mova                 m0, [rsp+32*0]
+    mova                 m2, [rsp+32*1]
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+.h16_w8:
+    mova                 m2, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    lea                  r6, [dstq+strideq*8]
+    add                 rsp, 32*4
+    punpcklbw            m3, m4, m2
+    punpckhbw            m4, m2
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
+    punpckldq            m4, m2, m0
+    punpckhdq            m2, m0
+    punpckldq            m0, m3, m1
+    punpckhdq            m3, m1
+    movq   [dstq+strideq*0], xm4
+    movhps [dstq+strideq*1], xm4
+    vextracti128        xm4, m4, 1
+    movq   [dstq+strideq*2], xm2
+    movhps [dstq+r2       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+strideq*4], xm0
+    movhps [dstq+r3       ], xm0
+    vextracti128        xm0, m0, 1
+    movq   [dstq+r2*2     ], xm3
+    movhps [dstq+r4       ], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*0], xm4
+    movhps   [r6+strideq*1], xm4
+    movq     [r6+strideq*2], xm2
+    movhps   [r6+r2       ], xm2
+    movq     [r6+strideq*4], xm0
+    movhps   [r6+r3       ], xm0
+    movq     [r6+r2*2     ], xm3
+    movhps   [r6+r4       ], xm3
+    sub                dstq, 8
+    sub              org_wd, 8
+    jge .h16_w8_loop
+.h16_end:
+    RET
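+; 32-tall strips: edge filtering always uses the fixed kernel at
+; z_filter_k+4*2 at this size, and the main loop emits one 32-byte
+; column per iteration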
+ALIGN function_align
+.h32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea            maxbased, [wq+31]
+    and            maxbased, 31
+    or             maxbased, 32 ; imin(w+31, 63)
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h32_main
+    vbroadcasti128       m0, [pb_0to15]
+    mov                 r4d, 21
+    mov                 r5d, 3
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    sub                 r4d, wd ; 21-w
+    cmovns              r5d, r4d
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    sub                 r4d, 8 ; 13-w
+    movd                xm1, r5d
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movd                xm2, r4d
+    vpbroadcastb         m1, xm1
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    vpbroadcastb         m2, xm2
+    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
+    movu                 m7, [z_filter_s+4]
+    pshufb              m11, m1
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vinserti128          m7, [z_filter_s+16], 0
+    pmaxsb               m2, m0 ; clip 8x32
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m12, m2
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    shufps               m8, m7, q1021
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    shufps               m8, m7, q2121
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    movzx               r4d, byte [tlq-63]
+    movzx               r2d, byte [tlq-62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3
+    mov            [rsp+31], r2b
+    lea                 tlq, [rsp+95]
+    mov            [tlq-65], r4b
+    mov                 r4d, 65
+    cmp                  wd, 64
+    cmove          maxbased, r4d
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0
+    mova           [tlq-31], m1
+.h32_main:
+    movd                xm6, dyd
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    not            maxbased
+    vbroadcasti128       m8, [z3_shuf]
+    add            maxbased, 64
+    vpbroadcastw         m9, xm9
+    psubw                m9, [z_base_inc]
+    mova                m11, m6
+    psubw               m10, m9, m3 ; 64*8
+.h32_loop:
+    mov                  r5, r4
+    sar                  r5, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r5- 0]
+    vinserti128          m0, [tlq+r5-16], 1
+    movu                xm1, [tlq+r5- 8]
+    vinserti128          m1, [tlq+r5-24], 1
+    sub                 rsp, 32
+    add                  r4, dyq
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddw                m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova              [rsp], m0
+    dec                  wd
+    jz .h32_transpose
+    cmp                 r4d, maxbased
+    jg .h32_loop
+.h32_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7
+    dec                  wd
+    jg .h32_end_loop
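+; read the buffered columns back 8 at a time and do a full byte transpose
+; with punpck{l,h}bw/wd/dq before storing 8-byte rows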
+.h32_transpose:
+    lea                dstq, [dstq+org_wq-8]
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+    mova                 m7, [rsp+32*0]
+    mova                 m6, [rsp+32*1]
+    mova                 m5, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    mova                 m3, [rsp+32*4]
+    mova                 m2, [rsp+32*5]
+    mova                 m1, [rsp+32*6]
+    mova                 m0, [rsp+32*7]
+    lea                  r6, [dstq+strideq*8]
+    add                 rsp, 32*8
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    movq   [dstq+strideq*0], xm6
+    movhps [dstq+strideq*1], xm6
+    vextracti128        xm6, m6, 1
+    movq   [dstq+strideq*2], xm7
+    movhps [dstq+r2       ], xm7
+    vextracti128        xm7, m7, 1
+    movq   [dstq+strideq*4], xm2
+    movhps [dstq+r3       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+r2*2     ], xm8
+    movhps [dstq+r4       ], xm8
+    vextracti128        xm8, m8, 1
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    vextracti128        xm1, m1, 1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    vextracti128        xm5, m5, 1
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    lea                  r6, [r6+strideq*8]
+    vextracti128        xm0, m0, 1
+    movq     [r6+strideq*0], xm6
+    movhps   [r6+strideq*1], xm6
+    movq     [r6+strideq*2], xm7
+    movhps   [r6+r2       ], xm7
+    movq     [r6+strideq*4], xm2
+    movhps   [r6+r3       ], xm2
+    movq     [r6+r2*2     ], xm8
+    movhps   [r6+r4       ], xm8
+    lea                  r6, [r6+strideq*8]
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    sub                dstq, 8
+    sub              org_wd, 8
+    jg .h32_w8_loop
+    RET
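+; 64-tall strips: each column is produced as two 32-byte halves;
+; out-of-range lanes appear to be masked via signed byte offsets
+; (pb_1to32/pb_32) rather than the word compares used by the smaller sizes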
+ALIGN function_align
+.h64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [wq+63]
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h64_main
+    mov                 r4d, 21
+    vpbroadcastb       xm11, [tlq-127]
+    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
+    sub                 r4d, wd ; 21-w
+    mov                 r5d, 3
+    vinserti128         m11, [tlq-116], 1    ; 104-111
+    movu                 m7, [z_filter_s+4]
+    cmp                  wd, 32
+    cmove               r4d, r5d
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vbroadcasti128       m6, [pb_0to15]
+    movd                xm1, r4d
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    movu               xm12, [tlq-122]       ; 112-119
+    vinserti128         m12, [tlq-108], 1    ;  96-103
+    vpbroadcastb         m1, xm1
+    movu               xm13, [tlq- 98]       ;  88- 95
+    vinserti128         m13, [tlq- 84], 1    ;  72- 79
+    movu               xm14, [tlq- 90]       ;  80- 87
+    vinserti128         m14, [tlq- 76], 1    ;  64- 71
+    vinserti128          m7, [z_filter_s+16], 0
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pmaxsb               m1, m6 ; clip (16|32)x64
+    pshufb              m13, m1
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    shufps              m15, m8, m7, q1021
+    pshufb              m10, m11, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    shufps              m10, m8, m7, q2132
+    pshufb              m11, m10
+    pmaddubsw           m11, m9
+    pshufb              m12, m10
+    pmaddubsw           m12, m9
+    pshufb              m13, m10
+    pmaddubsw           m13, m9
+    pshufb              m14, m10
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 tlq, [rsp+127]
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova          [tlq-127], m0
+    mova          [tlq- 95], m1
+    pshufb               m0, m11, m10
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m10
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m10
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    paddw                m0, m7
+    pshufb               m7, m12, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m1, m7
+    pshufb               m7, m14, m10
+    pmaddubsw            m7, m9
+    paddw                m6, m7
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m15
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0
+    mova           [tlq-31], m1
+.h64_main:
+    movd               xm12, dyd
+    neg            maxbaseq
+    vbroadcasti128       m8, [z3_shuf]
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw        m12, xm12
+    lea                 r5d, [dyq+maxbaseq-64]
+    neg                 dyq
+    or             maxbased, 63
+    lea                  r4, [dyq+63]
+    movd                xm6, r5d
+    mova               xm10, [pb_1to32+16]
+    vinserti128         m10, [pb_1to32], 1
+    vpbroadcastd        m11, [pb_32]
+    vpbroadcastw         m6, xm6
+.h64_loop:
+    mov                  r5, r4
+    sar                  r5, 6
+    movu                 m0, [tlq+r5-24]
+    movu                 m1, [tlq+r5-32]
+    pand                 m2, m4, m6
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    psraw                m2, m6, 6
+    sub                 rsp, 64
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packsswb             m2, m2
+    paddb                m2, m10
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m2
+    mova           [rsp+32], m0
+    movu                 m0, [tlq+r5-56]
+    movu                 m1, [tlq+r5-64]
+    add                  r4, dyq
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    paddb                m2, m11
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddw                m6, m12
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m2
+    mova              [rsp], m0
+    dec                  wd
+    jz .h64_transpose
+    cmp                 r4d, maxbased
+    jg .h64_loop
+.h64_end_loop:
+    sub                 rsp, 64
+    mova           [rsp+32], m7
+    mova           [rsp+ 0], m7
+    dec                  wd
+    jg .h64_end_loop
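+    ; hedged reading: the prediction was generated column-wise into a
+    ; stack buffer (rsp grows downwards, one 64-byte column per step);
+    ; the punpck byte->word->dword network below, plus vpermq lane
+    ; fixups, transposes 16 columns per outer pass into row-major dst.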
+.h64_transpose:
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    imul                 r5, strideq, -8
+    lea                dstq, [dstq+org_wq-16]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+    lea                  r6, [rsp+16*3]
+.h64_transpose_loop:
+    mova                xm0, [r6+64*15]
+    vinserti128          m0, [r6+64* 7], 1
+    mova                xm1, [r6+64*14]
+    vinserti128          m1, [r6+64* 6], 1
+    mova                xm2, [r6+64*13]
+    vinserti128          m2, [r6+64* 5], 1
+    mova                xm3, [r6+64*12]
+    vinserti128          m3, [r6+64* 4], 1
+    mova                xm4, [r6+64*11]
+    vinserti128          m4, [r6+64* 3], 1
+    mova                xm5, [r6+64*10]
+    vinserti128          m5, [r6+64* 2], 1
+    mova                xm6, [r6+64* 9]
+    vinserti128          m6, [r6+64* 1], 1
+    mova                xm7, [r6+64* 8]
+    vinserti128          m7, [r6+64* 0], 1
+    sub                  r6, 16
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    vpermq               m6, m6, q3120
+    vpermq               m7, m7, q3120
+    vpermq               m2, m2, q3120
+    vpermq               m8, m8, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m1, m1, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm6
+    vextracti128 [dstq+strideq*1], m6, 1
+    mova         [dstq+strideq*2], xm7
+    vextracti128 [dstq+r2       ], m7, 1
+    mova         [dstq+strideq*4], xm2
+    vextracti128 [dstq+r3       ], m2, 1
+    mova         [dstq+r2*2     ], xm8
+    vextracti128 [dstq+r4       ], m8, 1
+    sub               dstq, r5
+    mova         [dstq+strideq*0], xm3
+    vextracti128 [dstq+strideq*1], m3, 1
+    mova         [dstq+strideq*2], xm1
+    vextracti128 [dstq+r2       ], m1, 1
+    mova         [dstq+strideq*4], xm5
+    vextracti128 [dstq+r3       ], m5, 1
+    mova         [dstq+r2*2     ], xm0
+    vextracti128 [dstq+r4       ], m0, 1
+    sub                dstq, r5
+    cmp                  r6, rsp
+    jae .h64_transpose_loop
+    add                 rsp, 64*16
+    lea                dstq, [dstq+r5*8-16]
+    sub              org_wd, 16
+    jg .h64_transpose_loop0
+.h64_end:
+    RET
+
+%macro FILTER_XMM 4 ; dst, src, tmp, shuf
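+    ; %4 may be an xmm register number or a memory operand; %ifnum
+    ; selects the matching pshufb form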
+%ifnum %4
+    pshufb             xm%2, xm%4
+%else
+    pshufb             xm%2, %4
+%endif
+    pshufd             xm%1, xm%2, q0000 ; p0 p1
+    pmaddubsw          xm%1, xm2
+    pshufd             xm%3, xm%2, q1111 ; p2 p3
+    pmaddubsw          xm%3, xm3
+    paddw              xm%1, xm1
+    paddw              xm%1, xm%3
+    pshufd             xm%3, xm%2, q2222 ; p4 p5
+    pmaddubsw          xm%3, xm4
+    paddw              xm%1, xm%3
+    pshufd             xm%3, xm%2, q3333 ; p6 __
+    pmaddubsw          xm%3, xm5
+    paddw              xm%1, xm%3
+    psraw              xm%1, 4
+    packuswb           xm%1, xm%1
+%endmacro
+
+%macro FILTER_YMM 4 ; dst, src, tmp, shuf
+    pshufb              m%2, m%4
+    pshufd              m%1, m%2, q0000
+    pmaddubsw           m%1, m2
+    pshufd              m%3, m%2, q1111
+    pmaddubsw           m%3, m3
+    paddw               m%1, m1
+    paddw               m%1, m%3
+    pshufd              m%3, m%2, q2222
+    pmaddubsw           m%3, m4
+    paddw               m%1, m%3
+    pshufd              m%3, m%2, q3333
+    pmaddubsw           m%3, m5
+    paddw               m%1, m%3
+    psraw               m%1, 4
+    vpermq              m%3, m%1, q1032
+    packuswb            m%1, m%3
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order,
+; which increases parallelism compared to doing things row by row.
+; One redundant block is calculated for w8 and w16, two for w32.
+;     w4     w8       w16             w32
+;     1     1 2     1 2 3 5     1 2 3 5 b c d f
+;     2     2 3     2 4 5 7     2 4 5 7 c e f h
+;     3     3 4     4 6 7 9     4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+;           5       8           8       i
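+;
+; As a hedged illustration (C-style pseudocode, not dav1d's actual
+; reference code), each 4x2 block is predicted from seven neighbors
+; p0..p6 (the top-left, four top and two left pixels):
+;
+;     for (int y = 0; y < 2; y++)
+;         for (int x = 0; x < 4; x++) {
+;             int acc = 8;                      // pw_8 rounding bias
+;             for (int i = 0; i < 7; i++)       // signed 8-bit taps
+;                 acc += taps[i][y * 4 + x] * p[i];
+;             dst[y][x] = clip_pixel(acc >> 4); // psraw 4 + packuswb
+;         }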
+
+cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter
+%define base r6-ipred_filter_avx2_table
+    lea                  r6, [filter_intra_taps]
+    tzcnt                wd, wm
+%ifidn filterd, filterm
+    movzx           filterd, filterb
+%else
+    movzx           filterd, byte filterm
+%endif
+    shl             filterd, 6
+    add             filterq, r6
+    lea                  r6, [ipred_filter_avx2_table]
+    movq                xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
+    movsxd               wq, [r6+wq*4]
+    vpbroadcastd         m1, [base+pw_8]
+    vbroadcasti128       m2, [filterq+16*0]
+    vbroadcasti128       m3, [filterq+16*1]
+    vbroadcasti128       m4, [filterq+16*2]
+    vbroadcasti128       m5, [filterq+16*3]
+    add                  wq, r6
+    mov                  hd, hm
+    jmp                  wq
+.w4:
+    WIN64_SPILL_XMM       9
+    mova                xm8, [base+filter_shuf2]
+    sub                 tlq, 3
+    sub                 tlq, hq
+    jmp .w4_loop_start
+.w4_loop:
+    pinsrd              xm0, xm6, [tlq+hq], 0
+    lea                dstq, [dstq+strideq*2]
+.w4_loop_start:
+    FILTER_XMM            6, 0, 7, 8
+    movd   [dstq+strideq*0], xm6
+    pextrd [dstq+strideq*1], xm6, 1
+    sub                  hd, 2
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      10
+    mova                 m8, [base+filter_shuf1]
+    FILTER_XMM            7, 0, 6, [base+filter_shuf2]
+    vpbroadcastd         m0, [tlq+4]
+    vpbroadcastd         m6, [tlq+5]
+    sub                 tlq, 4
+    sub                 tlq, hq
+    vpbroadcastq         m7, xm7
+    vpblendd             m7, m6, 0x20
+.w8_loop:
+    vpbroadcastd        xm6, [tlq+hq]
+    palignr              m6, m0, 12
+    vpblendd             m0, m6, m7, 0xeb     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    mova                xm6, xm7
+    call .main
+    vpblendd            xm6, xm7, 0x0c
+    pshufd              xm6, xm6, q3120
+    movq   [dstq+strideq*0], xm6
+    movhps [dstq+strideq*1], xm6
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+%if WIN64
+    %assign stack_offset stack_offset - stack_size_padded
+    %assign xmm_regs_used 15
+    %assign stack_size_padded 0x98
+    SUB                 rsp, stack_size_padded
+%endif
+    sub                  hd, 2
+    TAIL_CALL .w16_main, 0
+.w16_main:
+%if WIN64
+    movaps       [rsp+0xa8], xmm6
+    movaps       [rsp+0xb8], xmm7
+    movaps       [rsp+0x28], xmm8
+    movaps       [rsp+0x38], xmm9
+    movaps       [rsp+0x48], xmm10
+    movaps       [rsp+0x58], xmm11
+    movaps       [rsp+0x68], xmm12
+    movaps       [rsp+0x78], xmm13
+    movaps       [rsp+0x88], xmm14
+%endif
+    FILTER_XMM           12, 0, 7, [base+filter_shuf2]
+    vpbroadcastd         m0, [tlq+5]
+    vpblendd             m0, [tlq-12], 0x14
+    mova                 m8, [base+filter_shuf1]
+    vpbroadcastq         m7, xm12
+    vpblendd             m0, m7, 0xc2         ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
+    movlps              xm9, xm7, [tlq+5]     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    vinserti128         m14, m8, [base+filter_shuf3], 0
+    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
+    FILTER_XMM            6, 9, 10, 14
+    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
+    vpbroadcastd         m9, [tlq+13]
+    vpbroadcastd        m10, [tlq+12]
+    psrld               m11, m8, 4
+    vpblendd             m6, m9, 0x20         ; top
+    sub                 tlq, 6
+    sub                 tlq, hq
+.w16_loop:
+    vpbroadcastd        xm9, [tlq+hq]
+    palignr              m9, m0, 12
+    vpblendd             m0, m9, m7, 0xe2     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    mova               xm13, xm7
+    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
+    vpblendd             m9, m12, m10, 0xf0
+    vpblendd            m12, m6, 0xc0
+    pshufd               m9, m9, q3333
+    vpblendd             m9, m6, 0xee
+    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
+    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
+    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
+    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
+    mova         [dstq+strideq*0], xm9
+    vextracti128 [dstq+strideq*1], m9, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_loop
+    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
+    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
+    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+    mova   [dstq+strideq*0], xm0
+    mova   [dstq+strideq*1], xm6
+    ret
+ALIGN function_align
+.w32:
+    sub                 rsp, stack_size_padded
+    sub                  hd, 2
+    lea                  r3, [dstq+16]
+    lea                 r5d, [hq-2]
+    call .w16_main
+    add                 tlq, r5
+    mov                dstq, r3
+    lea                  r3, [strideq-4]
+    lea                  r4, [r3+strideq*2]
+    movq                xm0, [tlq+21]
+    pinsrd              xm0, [dstq-4], 2
+    pinsrd              xm0, [dstq+r3*1], 3
+    FILTER_XMM           12, 0, 7, 14         ; a0 b0 a0 b0
+    movq                xm7, [dstq+r3*2]
+    pinsrd              xm7, [dstq+r4], 2
+    palignr             xm7, xm0, 12          ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
+    vpbroadcastd         m0, [tlq+28]
+    vpbroadcastd         m9, [tlq+29]
+    vbroadcasti128       m8, [base+filter_shuf1+16]
+    vpblendd             m0, m9, 0x20
+    vpblendd             m0, m7, 0x0f
+    vpbroadcastq         m7, xm12
+    vpblendd             m0, m7, 0xc2         ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
+    add                  r3, 2
+    lea                  r4, [r4+strideq*2]
+    movlps              xm9, xm7, [tlq+29]    ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
+    FILTER_XMM            6, 9, 10, 14
+    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
+    vpbroadcastd         m9, [tlq+37]
+    vpbroadcastd        m10, [tlq+36]
+    vpblendd             m6, m9, 0x20         ; top
+.w32_loop:
+    movq                xm9, [dstq+r3*4]
+    pinsrd              xm9, [dstq+r4], 2
+.w32_loop_last:
+    palignr              m9, m0, 12
+    vpblendd             m0, m9, m7, 0xe2     ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    mova               xm13, xm7              ; c0 d0
+    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
+    vpblendd             m9, m12, m10, 0xf0
+    vpblendd            m12, m6, 0xc0
+    pshufd               m9, m9, q3333
+    vpblendd             m9, m6, 0xee
+    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
+    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
+    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
+    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
+    mova         [dstq+strideq*0], xm9
+    vextracti128 [dstq+strideq*1], m9, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                 r5d, 2
+    jg .w32_loop
+    jz .w32_loop_last
+    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
+    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
+    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+    mova   [dstq+strideq*0], xm0
+    mova   [dstq+strideq*1], xm6
+    RET
+ALIGN function_align
+.main:
+    FILTER_YMM            7, 0, 9, 8
+    ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+    psignw               m3, m%1, m1
+    pabsw               m%1, m%1
+    pmulhrsw            m%1, m2
+    psignw              m%1, m3
+    paddw               m%1, m0
+%endmacro
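+
+; Hedged reading of IPRED_CFL: with m0 = dc, m1 = alpha and
+; m2 = |alpha| << 9 it computes
+;     pred = dc + apply_sign((|alpha * ac| + 32) >> 6, alpha * ac)
+; since pmulhrsw(a, |alpha| << 9) = (a * |alpha| * 2^10 + 2^15) >> 16
+;                                 = (a * |alpha| + 32) >> 6,
+; and the psignw pair restores the sign of alpha * ac.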
+
+cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    lea                  t0, [ipred_cfl_left_avx2_table]
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    shrx                r6d, r6d, wd
+    movd                xm3, r6d
+    movsxd               r6, [t0+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
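+; note (hedged): pmaddubsw against the all-ones mask (pcmpeqd) yields
+; negated pair sums, and the later pmaddwd with the same mask negates
+; them back; 0x8000 >> log2(n) as a pmulhrsw multiplier then divides
+; the sum by n with rounding: (sum * 2^(16-log2(n)) + 2^15) >> 16
+; == (sum + n/2) >> log2(n).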
+
+cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    mov                  hd, hm ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 t0d, 0x8000
+    shrx                t0d, t0d, r6d
+    movd                xm3, t0d
+    lea                  t0, [ipred_cfl_left_avx2_table]
+    movsxd               r6, [t0+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+.h32:
+    vextracti128        xm1, m0, 1
+    paddw               xm0, xm1
+.h16:
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+.h8:
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+.h4:
+    pmaddwd             xm0, xm2
+    pmulhrsw            xm0, xm3
+    vpbroadcastw         m0, xm0
+    jmp                  wq
+
+cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    movifnidn            hd, hm
+    movifnidn            wd, wm
+    tzcnt               r6d, hd
+    lea                 t0d, [wq+hq]
+    movd                xm4, t0d
+    tzcnt               t0d, t0d
+    movd                xm5, t0d
+    lea                  t0, [ipred_cfl_avx2_table]
+    tzcnt                wd, wd
+    movsxd               r6, [t0+r6*4]
+    movsxd               wq, [t0+wq*4+4*4]
+    pcmpeqd              m3, m3
+    psrlw               xm4, 1
+    add                  r6, t0
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+.h4:
+    movd                xm0, [tlq-4]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w4:
+    movd                xm1, [tlq+1]
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw               xm0, 3
+    jmp .w4_end
+.w4_mul:
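+    ; hedged: for w == 4, h > 4 the sum spans w + h = 12 or 20 pixels.
+    ; psrlw by 2 divides by 4; the packed constant then supplies a
+    ; fixed-point reciprocal via shrx by h*2: 0x5556 ~ 2^16/3 for
+    ; h == 8, and 0x3334 ~ 2^16/5 for h == 16 (shift 32 wraps to 0),
+    ; so pmulhuw finishes the division by 12 or 20.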
+    punpckhqdq          xm1, xm0, xm0
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x55563334
+    paddw               xm0, xm1
+    shrx                r6d, r6d, r2d
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    movd                xm1, r6d
+    psrlw               xm0, 2
+    pmulhuw             xm0, xm1
+.w4_end:
+    vpbroadcastw         m0, xm0
+.s4:
+    vpbroadcastw         m1, alpham
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s4_loop:
+    mova                 m4, [acq]
+    IPRED_CFL             4
+    packuswb             m4, m4
+    vextracti128        xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+r6       ], xm5, 1
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .s4_loop
+    RET
+ALIGN function_align
+.h8:
+    movq                xm0, [tlq-8]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w8:
+    movq                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    punpckhqdq          xm2, xm0, xm0
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    cmp                  hd, 32
+    cmove               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w8_end:
+    vpbroadcastw         m0, xm0
+.s8:
+    vpbroadcastw         m1, alpham
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s8_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vextracti128        xm5, m4, 1
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*1], xm5
+    movhps [dstq+strideq*2], xm4
+    movhps [dstq+r6       ], xm5
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .s8_loop
+    RET
+ALIGN function_align
+.h16:
+    mova                xm0, [tlq-16]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w16:
+    movu                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hb, 8|32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w16_end:
+    vpbroadcastw         m0, xm0
+.s16:
+    vpbroadcastw         m1, alpham
+    pabsw                m2, m1
+    psllw                m2, 9
+.s16_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vpermq               m4, m4, q3120
+    mova         [dstq+strideq*0], xm4
+    vextracti128 [dstq+strideq*1], m4, 1
+    lea                dstq, [dstq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .s16_loop
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 32
+    je .w32_end
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w32_end:
+    vpbroadcastw         m0, xm0
+.s32:
+    vpbroadcastw         m1, alpham
+    pabsw                m2, m1
+    psllw                m2, 9
+.s32_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vpermq               m4, m4, q3120
+    mova             [dstq], m4
+    add                dstq, strideq
+    add                 acq, 64
+    dec                  hd
+    jg .s32_loop
+    RET
+
+cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    lea                  t0, [ipred_cfl_splat_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [t0+wq*4]
+    vpbroadcastd         m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  wq
+
+cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+    movifnidn         hpadd, hpadm
+    movifnidn            wd, wm
+    mov                  hd, hm
+    mov                 szd, wd
+    mov             ac_bakq, acq
+    imul                szd, hd
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    vpbroadcastd         m2, [pb_2]
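+    ; hedged: all three cfl_ac variants scale luma to 3 fractional
+    ; bits (8x the average): 420 sums 2x2 pixels weighted by pb_2,
+    ; 422 sums horizontal pairs weighted by pb_4, and 444 shifts
+    ; single pixels left by 3.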
+    pxor                 m4, m4
+    cmp                  wd, 8
+    jg .w16
+    je .w8
+    ; fall-through
+
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+    lea            stride3q, [strideq*3]
+.w4_loop:
+    movq                xm0, [yq]
+    movq                xm1, [yq+strideq]
+    movhps              xm0, [yq+strideq*2]
+    movhps              xm1, [yq+stride3q]
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    paddw               xm0, xm1
+    mova              [acq], xm0
+    paddw               xm4, xm0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 16
+    sub                  hd, 2
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    vpermq               m0, m0, q1111
+.w4_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 4
+    jg .w4_hpad_loop
+    jmp .calc_avg
+
+.w8:
+    lea            stride3q, [strideq*3]
+    test              wpadd, wpadd
+    jnz .w8_wpad
+.w8_loop:
+    mova                xm0, [yq]
+    mova                xm1, [yq+strideq]
+    vinserti128          m0, [yq+strideq*2], 1
+    vinserti128          m1, [yq+stride3q], 1
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 2
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w8_hpad
+.w8_wpad:
+    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+    movq                xm0, [yq]
+    movq                xm1, [yq+strideq]
+    vinserti128          m0, [yq+strideq*2], 1
+    vinserti128          m1, [yq+stride3q], 1
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    pshufb               m0, m3
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 2
+    jg .w8_wpad_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+.w8_hpad:
+    vpermq               m0, m0, q3232
+.w8_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 2
+    jg .w8_hpad_loop
+    jmp .calc_avg
+
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w16_hpad_loop
+.w16_wpad:
+    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+    lea               iptrq, [ipred_cfl_ac_420_avx2_table]
+    shl               wpadd, 2
+    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+                              ipred_cfl_ac_420_avx2_table+wpadq*8-32]
+    movsxd            wpadq, [iptrq+wpadq+4]
+    add               iptrq, wpadq
+    jmp iptrq
+.w16_pad3:
+    vpbroadcastq         m0, [yq]
+    vpbroadcastq         m1, [yq+strideq]
+    jmp .w16_wpad_end
+.w16_pad2:
+    vbroadcasti128       m0, [yq]
+    vbroadcasti128       m1, [yq+strideq]
+    jmp .w16_wpad_end
+.w16_pad1:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    ; fall-through
+.w16_wpad_end:
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    pshufb               m0, m3
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jz .w16_wpad_done
+    jmp iptrq
+.w16_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg
+.w16_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 32
+    dec               hpadd
+    jg .w16_hpad_loop
+    ; fall-through
+
+.calc_avg:
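+    ; hedged: m4 holds the running sum of all ac words; after the
+    ; horizontal reduction, avg = (sum + sz/2) >> tzcnt(sz) (sz = w*h,
+    ; a power of two), and the average is subtracted below so the ac
+    ; block is zero-mean as CfL requires.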
+    vpbroadcastd         m2, [pw_1]
+    pmaddwd              m0, m4, m2
+    vextracti128        xm1, m0, 1
+    tzcnt               r1d, szd
+    paddd               xm0, xm1
+    movd                xm2, r1d
+    movd                xm3, szd
+    punpckhqdq          xm1, xm0, xm0
+    paddd               xm0, xm1
+    psrad               xm3, 1
+    psrlq               xm1, xm0, 32
+    paddd               xm0, xm3
+    paddd               xm0, xm1
+    psrad               xm0, xm2
+    vpbroadcastw         m0, xm0
+.sub_loop:
+    mova                 m1, [ac_bakq]
+    psubw                m1, m0
+    mova          [ac_bakq], m1
+    add             ac_bakq, 32
+    sub                 szd, 16
+    jg .sub_loop
+    RET
+
+cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+    movifnidn         hpadd, hpadm
+    movifnidn            wd, wm
+    mov                  hd, hm
+    mov                 szd, wd
+    mov             ac_bakq, acq
+    imul                szd, hd
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    vpbroadcastd         m2, [pb_4]
+    pxor                 m4, m4
+    pxor                 m5, m5
+    cmp                  wd, 8
+    jg .w16
+    je .w8
+    ; fall-through
+
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+    lea            stride3q, [strideq*3]
+.w4_loop:
+    movq                xm1, [yq]
+    movhps              xm1, [yq+strideq]
+    movq                xm0, [yq+strideq*2]
+    movhps              xm0, [yq+stride3q]
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    mova              [acq], xm1
+    mova           [acq+16], xm0
+    paddw               xm4, xm0
+    paddw               xm5, xm1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    vpermq               m0, m0, q1111
+.w4_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 4
+    jg .w4_hpad_loop
+    jmp .calc_avg
+
+.w8:
+    lea            stride3q, [strideq*3]
+    test              wpadd, wpadd
+    jnz .w8_wpad
+.w8_loop:
+    mova                xm1, [yq]
+    vinserti128          m1, [yq+strideq], 1
+    mova                xm0, [yq+strideq*2]
+    vinserti128          m0, [yq+stride3q], 1
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w8_hpad
+.w8_wpad:
+    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+    movq                xm1, [yq]
+    vinserti128          m1, [yq+strideq], 1
+    movq                xm0, [yq+strideq*2]
+    vinserti128          m0, [yq+stride3q], 1
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pshufb               m0, m3
+    pshufb               m1, m3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .w8_wpad_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+.w8_hpad:
+    vpermq               m0, m0, q3232
+.w8_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 2
+    jg .w8_hpad_loop
+    jmp .calc_avg
+
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    mova                 m1, [yq]
+    mova                 m0, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w16_hpad_loop
+.w16_wpad:
+    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+    lea               iptrq, [ipred_cfl_ac_422_avx2_table]
+    shl               wpadd, 2
+    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+                              ipred_cfl_ac_422_avx2_table+wpadq*8-32]
+    movsxd            wpadq, [iptrq+wpadq+4]
+    add               iptrq, wpadq
+    jmp iptrq
+.w16_pad3:
+    vpbroadcastq         m1, [yq]
+    vpbroadcastq         m0, [yq+strideq]
+    jmp .w16_wpad_end
+.w16_pad2:
+    vbroadcasti128       m1, [yq]
+    vbroadcasti128       m0, [yq+strideq]
+    jmp .w16_wpad_end
+.w16_pad1:
+    mova                 m1, [yq]
+    mova                 m0, [yq+strideq]
+    ; fall-through
+.w16_wpad_end:
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pshufb               m0, m3
+    pshufb               m1, m3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jz .w16_wpad_done
+    jmp iptrq
+.w16_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg
+.w16_hpad_loop:
+    mova              [acq], m0
+    mova           [acq+32], m0
+    paddw                m4, m0
+    paddw                m5, m0
+    add                 acq, 64
+    sub               hpadd, 2
+    jg .w16_hpad_loop
+    ; fall-through
+
+.calc_avg:
+    vpbroadcastd         m2, [pw_1]
+    pmaddwd              m5, m5, m2
+    pmaddwd              m0, m4, m2
+    paddd                m0, m5
+    vextracti128        xm1, m0, 1
+    tzcnt               r1d, szd
+    paddd               xm0, xm1
+    movd                xm2, r1d
+    movd                xm3, szd
+    punpckhqdq          xm1, xm0, xm0
+    paddd               xm0, xm1
+    psrad               xm3, 1
+    psrlq               xm1, xm0, 32
+    paddd               xm0, xm3
+    paddd               xm0, xm1
+    psrad               xm0, xm2
+    vpbroadcastw         m0, xm0
+.sub_loop:
+    mova                 m1, [ac_bakq]
+    psubw                m1, m0
+    mova          [ac_bakq], m1
+    add             ac_bakq, 32
+    sub                 szd, 16
+    jg .sub_loop
+    RET
+
+cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+    movifnidn         hpadd, hpadm
+    movifnidn            wd, wm
+    mov                  hd, hm
+    mov                 szd, wd
+    imul                szd, hd
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    pxor                 m4, m4
+    vpbroadcastd         m5, [pw_1]
+    tzcnt               r8d, wd
+    lea                  r5, [ipred_cfl_ac_444_avx2_table]
+    movsxd               r8, [r5+r8*4+12]
+    add                  r5, r8
+
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+    mov             ac_bakq, acq
+    jmp                  r5
+
+.w4:
+    lea            stride3q, [strideq*3]
+    pxor                xm2, xm2
+.w4_loop:
+    movd                xm1, [yq]
+    movd                xm0, [yq+strideq*2]
+    pinsrd              xm1, [yq+strideq], 1
+    pinsrd              xm0, [yq+stride3q], 1
+    punpcklbw           xm1, xm2
+    punpcklbw           xm0, xm2
+    psllw               xm1, 3
+    psllw               xm0, 3
+    mova              [acq], xm1
+    mova           [acq+16], xm0
+    paddw               xm1, xm0
+    paddw               xm4, xm1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg_mul
+    pshufd              xm0, xm0, q3232
+    paddw               xm1, xm0, xm0
+.w4_hpad_loop:
+    mova              [acq], xm0
+    mova           [acq+16], xm0
+    paddw               xm4, xm1
+    add                 acq, 32
+    sub               hpadd, 4
+    jg .w4_hpad_loop
+    jmp .calc_avg_mul
+
+.w8:
+    lea            stride3q, [strideq*3]
+    pxor                 m2, m2
+.w8_loop:
+    movq                xm1, [yq]
+    movq                xm0, [yq+strideq*2]
+    vinserti128          m1, [yq+strideq], 1
+    vinserti128          m0, [yq+stride3q], 1
+    punpcklbw            m1, m2
+    punpcklbw            m0, m2
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    paddw                m4, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg_mul
+    vpermq               m0, m0, q3232
+    paddw                m1, m0, m0
+.w8_hpad_loop:
+    mova              [acq], m0
+    mova           [acq+32], m0
+    paddw                m4, m1
+    add                 acq, 64
+    sub               hpadd, 4
+    jg .w8_hpad_loop
+    jmp .calc_avg_mul
+
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    pmovzxbw             m1, [yq]
+    pmovzxbw             m0, [yq+strideq]
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    pmaddwd              m1, m5
+    paddd                m4, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w16_hpad
+.w16_wpad:
+    mova                 m3, [cfl_ac_444_w16_pad1_shuffle]
+.w16_wpad_loop:
+    vpbroadcastq         m1, [yq]
+    vpbroadcastq         m0, [yq+strideq]
+    pshufb               m1, m3
+    pshufb               m0, m3
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    pmaddwd              m1, m5
+    paddd                m4, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_wpad_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+.w16_hpad:
+    paddw                m1, m0, m0
+    pmaddwd              m1, m5
+.w16_hpad_loop:
+    mova              [acq], m0
+    mova           [acq+32], m0
+    paddd                m4, m1
+    add                 acq, 64
+    sub               hpadd, 2
+    jg .w16_hpad_loop
+    jmp .calc_avg
+
+.w32:
+    test              wpadd, wpadd
+    jnz .w32_wpad
+.w32_loop:
+    pmovzxbw             m1, [yq]
+    pmovzxbw             m0, [yq+16]
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m2, m1, m0
+    pmaddwd              m2, m5
+    paddd                m4, m2
+    add                  yq, strideq
+    add                 acq, 64
+    dec                  hd
+    jg .w32_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w32_hpad_loop
+.w32_wpad:
+    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+    lea               iptrq, [ipred_cfl_ac_444_avx2_table]
+    add               wpadd, wpadd
+    mova                 m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
+    movsxd            wpadq, [iptrq+wpadq+4]
+    add               iptrq, wpadq
+    jmp iptrq
+.w32_pad3:
+    vpbroadcastq         m1, [yq]
+    pshufb               m1, m3
+    vpermq               m0, m1, q3232
+    jmp .w32_wpad_end
+.w32_pad2:
+    pmovzxbw             m1, [yq]
+    pshufhw              m0, m1, q3333
+    vpermq               m0, m0, q3333
+    jmp .w32_wpad_end
+.w32_pad1:
+    pmovzxbw             m1, [yq]
+    vpbroadcastq         m0, [yq+16]
+    pshufb               m0, m3
+    ; fall-through
+.w32_wpad_end:
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m2, m1, m0
+    pmaddwd              m2, m5
+    paddd                m4, m2
+    add                  yq, strideq
+    add                 acq, 64
+    dec                  hd
+    jz .w32_wpad_done
+    jmp iptrq
+.w32_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg
+.w32_hpad_loop:
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddd                m4, m2
+    add                 acq, 64
+    dec               hpadd
+    jg .w32_hpad_loop
+    jmp .calc_avg
+
+.calc_avg_mul:
+    pmaddwd              m4, m5
+.calc_avg:
+    vextracti128        xm1, m4, 1
+    tzcnt               r1d, szd
+    paddd               xm0, xm4, xm1
+    movd                xm2, r1d
+    movd                xm3, szd
+    punpckhqdq          xm1, xm0, xm0
+    paddd               xm0, xm1
+    psrad               xm3, 1
+    psrlq               xm1, xm0, 32
+    paddd               xm0, xm3
+    paddd               xm0, xm1
+    psrad               xm0, xm2
+    vpbroadcastw         m0, xm0
+.sub_loop:
+    mova                 m1, [ac_bakq]
+    psubw                m1, m0
+    mova          [ac_bakq], m1
+    add             ac_bakq, 32
+    sub                 szd, 16
+    jg .sub_loop
+    RET
+
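+; pal_pred: hedged scalar equivalent (illustrative C):
+;     for (int i = 0; i < w * h; i++)
+;         dst[i] = pal[idx[i]];
+; the 8-entry palette is packed to bytes once (packuswb), after which
+; pshufb serves as a byte LUT driven by the index bytes.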
+cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
+    vbroadcasti128       m4, [palq]
+    lea                  r2, [pal_pred_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r2+wq*4]
+    packuswb             m4, m4
+    add                  wq, r2
+    lea                  r2, [strideq*3]
+    jmp                  wq
+.w4:
+    pshufb              xm0, xm4, [idxq]
+    add                idxq, 16
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+.w8:
+    pshufb              xm0, xm4, [idxq+16*0]
+    pshufb              xm1, xm4, [idxq+16*1]
+    add                idxq, 16*2
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm1
+    movhps [dstq+r2       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
+    pshufb               m0, m4, [idxq+32*0]
+    pshufb               m1, m4, [idxq+32*1]
+    add                idxq, 32*2
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova         [dstq+strideq*2], xm1
+    vextracti128 [dstq+r2       ], m1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w16
+    RET
+ALIGN function_align
+.w32:
+    pshufb               m0, m4, [idxq+32*0]
+    pshufb               m1, m4, [idxq+32*1]
+    pshufb               m2, m4, [idxq+32*2]
+    pshufb               m3, m4, [idxq+32*3]
+    add                idxq, 32*4
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m1
+    mova   [dstq+strideq*2], m2
+    mova   [dstq+r2       ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w32
+    RET
+ALIGN function_align
+.w64:
+    pshufb               m0, m4, [idxq+32*0]
+    pshufb               m1, m4, [idxq+32*1]
+    pshufb               m2, m4, [idxq+32*2]
+    pshufb               m3, m4, [idxq+32*3]
+    add                idxq, 32*4
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m1
+    mova [dstq+strideq*1+32*0], m2
+    mova [dstq+strideq*1+32*1], m3
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w64
+    RET
+
+%endif
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c
new file mode 100644
index 0000000..4219ab8
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
+decl_angular_ipred_fn(dav1d_ipred_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_v_avx2);
+decl_angular_ipred_fn(dav1d_ipred_paeth_avx2);
+decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
+decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
+decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
+decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
+
+decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
+
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2);
+
+decl_pal_pred_fn(dav1d_pal_pred_avx2);
+
+decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_filter_ssse3);
+
+decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
+decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
+
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
+
+decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_ssse3;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_ssse3;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_ssse3;
+    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
+    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
+    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_ssse3;
+
+    c->cfl_pred[DC_PRED]         = dav1d_ipred_cfl_ssse3;
+    c->cfl_pred[DC_128_PRED]     = dav1d_ipred_cfl_128_ssse3;
+    c->cfl_pred[TOP_DC_PRED]     = dav1d_ipred_cfl_top_ssse3;
+    c->cfl_pred[LEFT_DC_PRED]    = dav1d_ipred_cfl_left_ssse3;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
+
+    c->pal_pred                  = dav1d_pal_pred_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_avx2;
+    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
+    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
+    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
+    c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
+    c->intra_pred[Z2_PRED]       = dav1d_ipred_z2_avx2;
+    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
+
+    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
+    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
+    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
+    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2;
+
+    c->pal_pred = dav1d_pal_pred_avx2;
+#endif
+}
diff --git a/src/x86/ipred_ssse3.asm b/src/x86/ipred_ssse3.asm
new file mode 100644
index 0000000..58cb0a4
--- /dev/null
@@ -0,0 +1,3108 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+    %rep %0
+        db %1-128, 127-%1
+        %rotate 1
+    %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x, with offsets
+; applied so the values fit the signed byte range that pmaddubsw requires
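+; hedged sketch of the identity: a one-directional smooth blend is
+;     pred = (w * a + (256 - w) * b + 128) >> 8
+; but pmaddubsw multiplies unsigned bytes by *signed* bytes, so each
+; weight is stored as the biased pair (w - 128, 127 - w); the smooth
+; kernels compensate with terms built from pb_127_m127/pw_128 before
+; the final shift.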
+smooth_weights: SMOOTH_WEIGHT_TABLE         \
+      0,   0, 255, 128, 255, 149,  85,  64, \
+    255, 197, 146, 105,  73,  50,  37,  32, \
+    255, 225, 196, 170, 145, 123, 102,  84, \
+     68,  54,  43,  33,  26,  20,  17,  16, \
+    255, 240, 225, 210, 196, 182, 169, 157, \
+    145, 133, 122, 111, 101,  92,  83,  74, \
+     66,  59,  52,  45,  39,  34,  29,  25, \
+     21,  17,  14,  12,  10,   9,   8,   8, \
+    255, 248, 240, 233, 225, 218, 210, 203, \
+    196, 189, 182, 176, 169, 163, 156, 150, \
+    144, 138, 133, 127, 121, 116, 111, 106, \
+    101,  96,  91,  86,  82,  77,  73,  69, \
+     65,  61,  57,  54,  50,  47,  44,  41, \
+     38,  35,  32,  29,  27,  25,  22,  20, \
+     18,  16,  15,  13,  12,  10,   9,   8, \
+      7,   6,   6,   5,   5,   4,   4,   4
+
+ipred_v_shuf      : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
+ipred_h_shuf      : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
+ipred_paeth_shuf  : db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
+filter_shuf1      : db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
+filter_shuf2      : db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
+
+pw_8        : times 8  dw 8
+pb_3        : times 16 db 3
+pb_128      : times 8  db 128
+pw_128      : times 4  dw 128
+pw_255      : times 4  dw 255
+pb_2        : times 8  db 2
+pb_4        : times 8  db 4
+pb_127_m127 : times 4  db 127, -127
+pd_32768    : times 1  dd 32768
+
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
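+; note (hedged): the -2*4 bias reflects the minimum block size of 4
+; (tzcnt(4) == 2), so a table entry can be fetched directly with
+; [table + tzcnt(size)*4].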
+
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
+
+JMP_TABLE ipred_h,          ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left,    ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+                                s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left,   ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter,     ssse3, w4, w8, w16, w32
+
+cextern filter_intra_taps
+
+
+SECTION .text
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro IPRED_SET   3                                          ; width, stride offset, pshuflw imm8
+    pshuflw                      m1, m0, %3                   ; broadcast one pixel pair across 8 bytes
+    punpcklqdq                   m1, m1
+    mova           [dstq +      %2], m1
+%if %1 > 16
+    mova           [dstq + 16 + %2], m1
+%endif
+%if %1 > 32
+    mova           [dstq + 32 + %2], m1
+    mova           [dstq + 48 + %2], m1
+%endif
+%endmacro
+
+%macro IPRED_H 1                                            ; width
+    sub                         tlq, 4
+    movd                         m0, [tlq]                  ; get 4 bytes of topleft data
+    punpcklbw                    m0, m0                     ; duplicate each byte (one pair per pixel)
+%if %1 == 4
+    pshuflw                      m1, m0, q2233
+    movd           [dstq+strideq*0], m1
+    psrlq                        m1, 32
+    movd           [dstq+strideq*1], m1
+    pshuflw                      m0, m0, q0011
+    movd           [dstq+strideq*2], m0
+    psrlq                        m0, 32
+    movd           [dstq+stride3q ], m0
+
+%elif %1 == 8
+    punpcklwd                    m0, m0
+    punpckhdq                    m1, m0, m0
+    punpckldq                    m0, m0
+    movq           [dstq+strideq*1], m1
+    movhps         [dstq+strideq*0], m1
+    movq           [dstq+stride3q ], m0
+    movhps         [dstq+strideq*2], m0
+%else
+    IPRED_SET                    %1,         0, q3333
+    IPRED_SET                    %1,   strideq, q2222
+    IPRED_SET                    %1, strideq*2, q1111
+    IPRED_SET                    %1,  stride3q, q0000
+%endif
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .w%1
+    RET
+%endmacro
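+
+; For reference, horizontal prediction fills each row with the left-edge pixel
+; of that row; a simplified C equivalent (not the exact dav1d C code):
+;
+;   #include <stddef.h>
+;   #include <stdint.h>
+;   #include <string.h>
+;   static void ipred_h_c(uint8_t *dst, ptrdiff_t stride,
+;                         const uint8_t *topleft, int w, int h) {
+;       for (int y = 0; y < h; y++, dst += stride)
+;           memset(dst, topleft[-(1 + y)], w);  // row y <- left[y]
+;   }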
+
+INIT_XMM ssse3
+cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
+    LEA                          r5, ipred_h_ssse3_table
+    tzcnt                        wd, wm
+    movifnidn                    hd, hm
+    movsxd                       wq, [r5+wq*4]
+    add                          wq, r5
+    lea                    stride3q, [strideq*3]
+    jmp                          wq
+.w4:
+    IPRED_H                       4
+.w8:
+    IPRED_H                       8
+.w16:
+    IPRED_H                      16
+.w32:
+    IPRED_H                      32
+.w64:
+    IPRED_H                      64
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_splat_ssse3_table
+    tzcnt                wd, wm
+    movu                 m0, [tlq+ 1]
+    movu                 m1, [tlq+17]
+    movu                 m2, [tlq+33]
+    movu                 m3, [tlq+49]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
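+
+; Vertical prediction replicates the row above the block; a simplified C
+; equivalent (the splat-table dispatch above is just a size switch):
+;
+;   #include <stddef.h>
+;   #include <stdint.h>
+;   #include <string.h>
+;   static void ipred_v_c(uint8_t *dst, ptrdiff_t stride,
+;                         const uint8_t *topleft, int w, int h) {
+;       for (int y = 0; y < h; y++, dst += stride)
+;           memcpy(dst, topleft + 1, w);  // every row <- top[]
+;   }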
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn                    hd, hm
+    movifnidn                    wd, wm
+    tzcnt                       r6d, hd
+    lea                         r5d, [wq+hq]
+    movd                         m4, r5d
+    tzcnt                       r5d, r5d
+    movd                         m5, r5d
+    LEA                          r5, ipred_dc_ssse3_table
+    tzcnt                        wd, wd
+    movsxd                       r6, [r5+r6*4]
+    movsxd                       wq, [r5+wq*4+20]
+    pcmpeqd                      m3, m3
+    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
+    add                          r6, r5
+    add                          wq, r5
+    lea                    stride3q, [strideq*3]
+    jmp r6
+.h4:
+    movd                         m0, [tlq-4]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w4:
+    movd                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m0, m4
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    cmp                          hd, 4
+    jg .w4_mul
+    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
+    jmp .w4_end
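+; When width != height, width + height is 12 or 20 here, so after the >>2 in
+; .w4_mul a division by 3 or 5 remains; it is done as a 16-bit fixed-point
+; multiply: 0x5556 ~= 0x10000/3 and 0x3334 ~= 0x10000/5.  Illustrative check:
+;
+;   #include <assert.h>
+;   // for the value ranges involved, (x * 0x5556) >> 16 == x / 3
+;   for (unsigned x = 0; x < 1u << 14; x++)
+;       assert((x * 0x5556) >> 16 == x / 3);  // likewise 0x3334 for x / 5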
+.w4_mul:
+    punpckhqdq                   m1, m0, m0
+    paddw                        m0, m1
+    psrlq                        m1, m0, 32
+    paddw                        m0, m1
+    psrlw                        m0, 2
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 8
+    cmovz                       r6d, r2d
+    movd                         m5, r6d
+    pmulhuw                      m0, m5
+.w4_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s4:
+    movd           [dstq+strideq*0], m0
+    movd           [dstq+strideq*1], m0
+    movd           [dstq+strideq*2], m0
+    movd           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s4
+    RET
+ALIGN function_align
+.h8:
+    movq                         m0, [tlq-8]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w8:
+    movq                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    paddw                        m0, m1
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 8
+    je .w8_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    cmp                          hd, 32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w8_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s8:
+    movq           [dstq+strideq*0], m0
+    movq           [dstq+strideq*1], m0
+    movq           [dstq+strideq*2], m0
+    movq           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                         m0, [tlq-16]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w16:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 16
+    je .w16_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 8|32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w16_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s16:
+    mova           [dstq+strideq*0], m0
+    mova           [dstq+strideq*1], m0
+    mova           [dstq+strideq*2], m0
+    mova           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                         m0, [tlq-32]
+    pmaddubsw                    m0, m3
+    mova                         m2, [tlq-16]
+    pmaddubsw                    m2, m3
+    paddw                        m0, m2
+    jmp wq
+.w32:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 32
+    je .w32_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 64|16
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w32_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+    mova                         m1, m0
+.s32:
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova           [dstq+strideq*2], m0
+    mova        [dstq+strideq*2+16], m1
+    mova            [dstq+stride3q], m0
+    mova         [dstq+stride3q+16], m1
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                         m0, [tlq-64]
+    mova                         m1, [tlq-48]
+    pmaddubsw                    m0, m3
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-32]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-16]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    jmp wq
+.w64:
+    movu                         m1, [tlq+ 1]
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m1, m3
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+33]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+49]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 64
+    je .w64_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w64_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+    mova                         m1, m0
+    mova                         m2, m0
+    mova                         m3, m0
+.s64:
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova                  [dstq+32], m2
+    mova                  [dstq+48], m3
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova          [dstq+strideq+32], m2
+    mova          [dstq+strideq+48], m3
+    lea                        dstq, [dstq+strideq*2]
+    sub                          hd, 2
+    jg .s64
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_left_ssse3_table
+    mov                  hd, hm                ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+    movd                 m2, r6d
+    psrld                m3, m2
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
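+; The byte sums are accumulated negated (pmaddubsw against the all -1 vector),
+; and the final pmaddwd against -1 folds word pairs while restoring the sign.
+; pmulhrsw with m3 = 32768 >> ctz(h) then performs a rounded divide by the
+; power-of-two height (an illustrative identity, valid for in-range x):
+;
+;   // pmulhrsw(x, 32768 >> k) == (x + (1 << (k - 1))) >> k
+;   // i.e. dc = (sum + h/2) / h  with  h == 1 << k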
+.h64:
+    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h16:
+    pshufd               m1, m0, q3232                          ; psrldq              m1, m0, 8
+    paddw                m0, m1
+.h8:
+    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
+    paddw                m0, m1
+.h4:
+    pmaddwd              m0, m2
+    pmulhrsw             m0, m3
+    lea            stride3q, [strideq*3]
+    pxor                 m1, m1
+    pshufb               m0, m1
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_splat_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+    LEA                  r5, ipred_dc_left_ssse3_table
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+    movd                 m2, wd
+    psrld                m3, m2
+    movsxd               r6, [r5+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+                ;            w * a         = (w - 128) * a + 128 * a
+                ;            (256 - w) * b = (127 - w) * b + 129 * b
+                ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
+    pmaddubsw            m6, m%3, m%1
+    pmaddubsw            m0, m%4, m%2                    ; (w - 128) * a + (127 - w) * b
+    paddw                m6, m%5
+    paddw                m0, m%6                         ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
+    psrlw                m6, 8
+    psrlw                m0, 8
+    packuswb             m6, m0
+%endmacro
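+
+; Scalar form of the smooth_v blend computed per pixel (simplified; the
+; weights come from sm_weights[] indexed by height):
+;
+;   #include <stdint.h>
+;   // dst[x] = (w[y]*top[x] + (256 - w[y])*bottom + 128) >> 8
+;   static uint8_t smooth_v_px(uint8_t top, uint8_t bottom, uint8_t w) {
+;       return (w * top + (256 - w) * bottom + 128) >> 8;
+;   }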
+
+cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_ssse3_table
+    LEA                  r6, ipred_smooth_v_ssse3_table
+    tzcnt                wd, wm
+    mov                  hd, hm
+    movsxd               wq, [r6+wq*4]
+    movddup              m0, [base+pb_127_m127]
+    movddup              m1, [base+pw_128]
+    lea            weightsq, [base+smooth_weights+hq*4]
+    neg                  hq
+    movd                 m5, [tlq+hq]
+    pxor                 m2, m2
+    pshufb               m5, m2
+    add                  wq, r6
+    jmp                  wq
+.w4:
+    movd                 m2, [tlq+1]
+    punpckldq            m2, m2
+    punpcklbw            m2, m5                          ; top, bottom
+    lea                  r3, [strideq*3]
+    mova                 m4, [base+ipred_v_shuf]
+    mova                 m5, m4
+    punpckldq            m4, m4
+    punpckhdq            m5, m5
+    pmaddubsw            m3, m2, m0                      ; m3: 127 * top - 127 * bottom
+    paddw                m1, m2                          ; m1:   1 * top + 256 * bottom + 128, overflow is ok
+    paddw                m3, m1                          ; m3: 128 * top + 129 * bottom + 128
+.w4_loop:
+    movu                 m1, [weightsq+hq*2]
+    pshufb               m0, m1, m4                      ; m2-m5 must stay unchanged across the loop
+    pshufb               m1, m5
+    SMOOTH                0, 1, 2, 2, 3, 3
+    movd   [dstq+strideq*0], m6
+    pshuflw              m1, m6, q1032
+    movd   [dstq+strideq*1], m1
+    punpckhqdq           m6, m6
+    movd   [dstq+strideq*2], m6
+    psrlq                m6, 32
+    movd   [dstq+r3       ], m6
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 4
+    jl .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    movq                 m2, [tlq+1]
+    punpcklbw            m2, m5
+    mova                 m5, [base+ipred_v_shuf]
+    lea                  r3, [strideq*3]
+    pshufd               m4, m5, q0000
+    pshufd               m5, m5, q1111
+    pmaddubsw            m3, m2, m0
+    paddw                m1, m2
+    paddw                m3, m1                           ; m3 feeds the loop below
+.w8_loop:
+    movq                 m1, [weightsq+hq*2]
+    pshufb               m0, m1, m4
+    pshufb               m1, m5
+    SMOOTH                0, 1, 2, 2, 3, 3
+    movq   [dstq+strideq*0], m6
+    movhps [dstq+strideq*1], m6
+    lea                dstq, [dstq+strideq*2]
+    add                  hq, 2
+    jl .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    movu                 m3, [tlq+1]
+    punpcklbw            m2, m3, m5
+    punpckhbw            m3, m5
+    pmaddubsw            m4, m2, m0
+    pmaddubsw            m5, m3, m0
+    paddw                m0, m1, m2
+    paddw                m1, m3
+    paddw                m4, m0
+    paddw                m5, m1                           ; m4 and m5 feed the loop below
+.w16_loop:
+    movd                 m1, [weightsq+hq*2]
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    SMOOTH 1, 1, 2, 3, 4, 5
+    mova             [dstq], m6
+    add                dstq, strideq
+    add                  hq, 1
+    jl .w16_loop
+    RET
+ALIGN function_align
+.w32:
+%if WIN64
+    movaps         [rsp+24], xmm7
+    %define xmm_regs_used 8
+%endif
+    mova                 m7, m5
+.w32_loop_init:
+    mov                 r3d, 2
+.w32_loop:
+    movddup              m0, [base+pb_127_m127]
+    movddup              m1, [base+pw_128]
+    movu                 m3, [tlq+1]
+    punpcklbw            m2, m3, m7
+    punpckhbw            m3, m7
+    pmaddubsw            m4, m2, m0
+    pmaddubsw            m5, m3, m0
+    paddw                m0, m1, m2
+    paddw                m1, m3
+    paddw                m4, m0
+    paddw                m5, m1
+    movd                 m1, [weightsq+hq*2]
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    SMOOTH                1, 1, 2, 3, 4, 5
+    mova             [dstq], m6
+    add                 tlq, 16
+    add                dstq, 16
+    dec                 r3d
+    jg .w32_loop
+    lea                dstq, [dstq-32+strideq]
+    sub                 tlq, 32
+    add                  hq, 1
+    jl .w32_loop_init
+    RET
+ALIGN function_align
+.w64:
+%if WIN64
+    movaps         [rsp+24], xmm7
+    %define xmm_regs_used 8
+%endif
+    mova                 m7, m5
+.w64_loop_init:
+    mov                 r3d, 4
+.w64_loop:
+    movddup              m0, [base+pb_127_m127]
+    movddup              m1, [base+pw_128]
+    movu                 m3, [tlq+1]
+    punpcklbw            m2, m3, m7
+    punpckhbw            m3, m7
+    pmaddubsw            m4, m2, m0
+    pmaddubsw            m5, m3, m0
+    paddw                m0, m1, m2
+    paddw                m1, m3
+    paddw                m4, m0
+    paddw                m5, m1
+    movd                 m1, [weightsq+hq*2]
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    SMOOTH                1, 1, 2, 3, 4, 5
+    mova             [dstq], m6
+    add                 tlq, 16
+    add                dstq, 16
+    dec                 r3d
+    jg .w64_loop
+    lea                dstq, [dstq-64+strideq]
+    sub                 tlq, 64
+    add                  hq, 1
+    jl .w64_loop_init
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_ssse3_table
+    LEA                  r6, ipred_smooth_h_ssse3_table
+    mov                  wd, wm
+    movd                 m3, [tlq+wq]
+    pxor                 m1, m1
+    pshufb               m3, m1                          ; right
+    tzcnt                wd, wd
+    mov                  hd, hm
+    movsxd               wq, [r6+wq*4]
+    movddup              m4, [base+pb_127_m127]
+    movddup              m5, [base+pw_128]
+    add                  wq, r6
+    jmp                  wq
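+; smooth_h is the same blend with the roles transposed: weights indexed by x,
+; blending left[y] against the pixel right of the top edge.  Scalar sketch
+; (simplified):
+;
+;   #include <stdint.h>
+;   // dst[x] = (w[x]*left[y] + (256 - w[x])*right + 128) >> 8
+;   static uint8_t smooth_h_px(uint8_t left, uint8_t right, uint8_t w) {
+;       return (w * left + (256 - w) * right + 128) >> 8;
+;   }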
+.w4:
+    movddup              m6, [base+smooth_weights+4*2]
+    mova                 m7, [base+ipred_h_shuf]
+    sub                 tlq, 4
+    sub                 tlq, hq
+    lea                  r3, [strideq*3]
+.w4_loop:
+    movd                 m2, [tlq+hq]                    ; left
+    pshufb               m2, m7
+    punpcklbw            m1, m2, m3                      ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
+    paddw                m0, m1                          ; 128 * left + 129 * right
+    pmaddubsw            m1, m6
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    pmaddubsw            m2, m6
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    movd   [dstq+strideq*0], m0
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq*1], m1
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*2], m0
+    psrlq                m0, 32
+    movd   [dstq+r3       ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    mova                 m6, [base+smooth_weights+8*2]
+    mova                 m7, [base+ipred_h_shuf]
+    sub                 tlq, 4
+    sub                 tlq, hq
+    punpckldq            m7, m7
+.w8_loop:
+    movd                 m2, [tlq+hq]                    ; left
+    pshufb               m2, m7
+    punpcklbw            m1, m2, m3                      ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
+    paddw                m0, m1                          ; 128 * left + 129 * right
+    pmaddubsw            m1, m6
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    pmaddubsw            m2, m6
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    movq   [dstq+strideq*0], m0
+    movhps [dstq+strideq*1], m0
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    mova                 m6, [base+smooth_weights+16*2]
+    mova                 m7, [base+smooth_weights+16*3]
+    sub                 tlq, 1
+    sub                 tlq, hq
+.w16_loop:
+    pxor                 m1, m1
+    movd                 m2, [tlq+hq]                    ; left
+    pshufb               m2, m1
+    punpcklbw            m1, m2, m3                      ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
+    paddw                m0, m1                          ; 128 * left + 129 * right
+    pmaddubsw            m1, m6
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    pmaddubsw            m2, m7
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    mova             [dstq], m0
+    lea                dstq, [dstq+strideq]
+    sub                  hd, 1
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    sub                 tlq, 1
+    sub                 tlq, hq
+    pxor                 m6, m6
+.w32_loop_init:
+    mov                  r5, 2
+    lea                  r3, [base+smooth_weights+16*4]
+.w32_loop:
+    mova                 m7, [r3]
+    add                  r3, 16
+    movd                 m2, [tlq+hq]                    ; left
+    pshufb               m2, m6
+    punpcklbw            m1, m2, m3                      ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
+    paddw                m0, m1                          ; 128 * left + 129 * right
+    pmaddubsw            m1, m7
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    mova                 m7, [r3]
+    add                  r3, 16
+    pmaddubsw            m2, m7
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, 16
+    dec                  r5
+    jg .w32_loop
+    lea                dstq, [dstq-32+strideq]
+    sub                  hd, 1
+    jg .w32_loop_init
+    RET
+ALIGN function_align
+.w64:
+    sub                 tlq, 1
+    sub                 tlq, hq
+    pxor                 m6, m6
+.w64_loop_init:
+    mov                  r5, 4
+    lea                  r3, [base+smooth_weights+16*8]
+.w64_loop:
+    mova                 m7, [r3]
+    add                  r3, 16
+    movd                 m2, [tlq+hq]                    ; left
+    pshufb               m2, m6
+    punpcklbw            m1, m2, m3                      ; left, right
+    punpckhbw            m2, m3
+    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
+    paddw                m0, m1                          ; 128 * left + 129 * right
+    pmaddubsw            m1, m7
+    paddw                m1, m5
+    paddw                m0, m1
+    pmaddubsw            m1, m2, m4
+    paddw                m1, m2
+    mova                 m7, [r3]
+    add                  r3, 16
+    pmaddubsw            m2, m7
+    paddw                m2, m5
+    paddw                m1, m2
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, 16
+    dec                  r5
+    jg .w64_loop
+    lea                dstq, [dstq-64+strideq]
+    sub                  hd, 1
+    jg .w64_loop_init
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH_2D_END  7                                  ; src[1-2], mul[1-2], add[1-2], m3
+    pmaddubsw            m6, m%3, m%1
+    mova                 m0, m6
+    pmaddubsw            m6, m%4, m%2
+    mova                 m1, m6
+%ifnum %5
+    paddw                m0, m%5
+%else
+    paddw                m0, %5
+%endif
+%ifnum %6
+    paddw                m1, m%6
+%else
+    paddw                m1, %6
+%endif
+%ifnum %7
+%else
+    mova                 m3, %7
+%endif
+    pavgw                m0, m2
+    pavgw                m1, m3
+    psrlw                m0, 8
+    psrlw                m1, 8
+    packuswb             m0, m1
+%endmacro
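+
+; The 2-D smooth blend averages the vertical and horizontal parts: the pavgw
+; above halves their sum with rounding and the final >>8 completes the >>9.
+; Net per-pixel effect (scalar sketch, simplified):
+;
+;   #include <stdint.h>
+;   static uint8_t smooth_px(uint8_t t, uint8_t b, uint8_t l, uint8_t r,
+;                            int wv, int wh) {
+;       return (wv*t + (256 - wv)*b + wh*l + (256 - wh)*r + 256) >> 9;
+;   }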
+
+%macro SMOOTH_OUTPUT_16B  12      ; top slot, [4 tmp slots,] [hor weights 1-2,] m3 slot, shuf slot, [m0/m4/m5 recovery slots]
+    mova                 m1, [rsp+16*%1]                  ; top
+    punpckhbw            m6, m1, m0                       ; top, bottom
+    punpcklbw            m1, m0                           ; top, bottom
+    pmaddubsw            m2, m1, m5
+    mova        [rsp+16*%2], m1
+    paddw                m1, m3                           ;   1 * top + 256 * bottom + 255
+    paddw                m2, m1                           ; 128 * top + 129 * bottom + 255
+    mova        [rsp+16*%3], m2
+    pmaddubsw            m2, m6, m5
+    mova        [rsp+16*%4], m6
+    paddw                m6, m3                           ;   1 * top + 256 * bottom + 255
+    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
+    mova        [rsp+16*%5], m2
+    movd                 m1, [tlq+hq]                     ; left
+    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
+    punpcklbw            m1, m4                           ; left, right
+    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
+    paddw                m2, m1                           ; 128 * left + 129 * right
+    mova                 m3, m2
+    pmaddubsw            m0, m1, %6                       ; weights_hor = &dav1d_sm_weights[width];
+    pmaddubsw            m1, %7
+    paddw                m2, m3, m0
+    paddw                m3, m1
+    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
+    mova                 m7, [rsp+16*%9]
+    pshufb               m1, m7
+    mova        [rsp+16*%8], m3
+    mova                 m4, [rsp+16*%2]
+    mova                 m5, [rsp+16*%3]
+    mova                 m3, [rsp+16*%4]
+    mova                 m7, [rsp+16*%5]
+    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*%8]
+    mova             [dstq], m0
+    movddup              m3, [base+pw_255]                ; recovery
+    mova                 m0, [rsp+16*%10]                 ; recovery
+    mova                 m4, [rsp+16*%11]                 ; recovery
+    mova                 m5, [rsp+16*%12]                 ; recovery
+%endmacro
+
+cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_ssse3_table
+    mov                  wd, wm
+    mov                  hd, hm
+    LEA                  r6, ipred_smooth_ssse3_table
+    movd                 m4, [tlq+wq]                     ; right
+    pxor                 m2, m2
+    pshufb               m4, m2
+    tzcnt                wd, wd
+    mov                  r5, tlq
+    sub                  r5, hq
+    movsxd               wq, [r6+wq*4]
+    movddup              m5, [base+pb_127_m127]
+    movd                 m0, [r5]
+    pshufb               m0, m2                           ; bottom
+    movddup              m3, [base+pw_255]
+    add                  wq, r6
+    lea          v_weightsq, [base+smooth_weights+hq*2]   ; weights_ver = &dav1d_sm_weights[height]
+    jmp                  wq
+.w4:
+    mova                 m7, [base+ipred_v_shuf]
+    movd                 m1, [tlq+1]                      ; left
+    pshufd               m1, m1, q0000
+    sub                 tlq, 4
+    lea                  r3, [strideq*3]
+    sub                 tlq, hq
+    punpcklbw            m1, m0                           ; top, bottom
+    pshufd               m6, m7, q1100
+    pshufd               m7, m7, q3322
+    pmaddubsw            m2, m1, m5
+    paddw                m3, m1                           ;   1 * top + 256 * bottom + 255
+    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
+    mova         [rsp+16*0], m1
+    mova         [rsp+16*1], m2
+    movq                 m1,  [base+smooth_weights+4*2]   ; weights_hor = &dav1d_sm_weights[width];
+    punpcklqdq           m1, m1
+    mova         [rsp+16*2], m1
+    mova         [rsp+16*3], m4
+    mova         [rsp+16*4], m6
+    mova         [rsp+16*5], m5
+.w4_loop:
+    movd                 m1, [tlq+hq]                 ; left
+    pshufb               m1, [base+ipred_h_shuf]
+    punpcklbw            m0, m1, m4                   ; left, right
+    punpckhbw            m1, m4
+    pmaddubsw            m2, m0, m5                   ; 127 * left - 127 * right
+    pmaddubsw            m3, m1, m5
+    paddw                m2, m0                       ; 128 * left + 129 * right
+    paddw                m3, m1
+    mova                 m4, [rsp+16*2]
+    pmaddubsw            m0, m4
+    pmaddubsw            m1, m4
+    paddw                m2, m0
+    paddw                m3, m1
+    movq                 m1, [v_weightsq]             ; weights_ver = &dav1d_sm_weights[height];
+    add          v_weightsq, 8
+    pshufb               m0, m1, m6
+    pshufb               m1, m7
+    mova                 m4, [rsp+16*0]
+    mova                 m5, [rsp+16*1]
+    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
+    mova                 m4, [rsp+16*3]
+    mova                 m6, [rsp+16*4]
+    mova                 m5, [rsp+16*5]
+    movd   [dstq+strideq*0], m0
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq*1], m1
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*2], m0
+    psrlq                m0, 32
+    movd   [dstq+r3       ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    mova                 m7, [base+ipred_v_shuf]
+    movq                 m1, [tlq+1]                  ; left
+    punpcklqdq           m1, m1
+    sub                 tlq, 4
+    sub                 tlq, hq
+    punpcklbw            m1, m0
+    pshufd               m6, m7, q0000
+    pshufd               m7, m7, q1111
+    pmaddubsw            m2, m1, m5
+    paddw                m3, m1
+    paddw                m2, m3
+    mova         [rsp+16*0], m1
+    mova         [rsp+16*1], m2
+    mova                 m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
+    mova         [rsp+16*2], m1
+    mova         [rsp+16*3], m4
+    mova         [rsp+16*4], m6
+    mova         [rsp+16*5], m5
+.w8_loop:
+    movd                 m1, [tlq+hq]                  ; left
+    pshufb               m1, [base+ipred_h_shuf]
+    pshufd               m1, m1, q1100
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    pmaddubsw            m2, m0, m5
+    pmaddubsw            m3, m1, m5
+    paddw                m2, m0
+    paddw                m3, m1
+    mova                 m4,  [rsp+16*2]
+    pmaddubsw            m0, m4
+    pmaddubsw            m1, m4
+    paddw                m2, m0
+    paddw                m3, m1
+    movd                 m1, [v_weightsq]              ; weights_ver = &dav1d_sm_weights[height];
+    add          v_weightsq, 4
+    pshufb               m0, m1, m6
+    pshufb               m1, m7
+    mova                 m4, [rsp+16*0]
+    mova                 m5, [rsp+16*1]
+    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+    mova                 m4, [rsp+16*3]
+    mova                 m6, [rsp+16*4]
+    mova                 m5, [rsp+16*5]
+    movq   [dstq+strideq*0], m0
+    movhps [dstq+strideq*1], m0
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    mova                 m7, [base+ipred_v_shuf]
+    movu                 m1, [tlq+1]                     ; left
+    sub                 tlq, 4
+    sub                 tlq, hq
+    punpckhbw            m6, m1, m0                      ; top, bottom
+    punpcklbw            m1, m0                          ; top, bottom
+    pshufd               m7, m7, q0000
+    mova         [rsp+16*2], m7
+    pmaddubsw            m2, m6, m5
+    mova         [rsp+16*5], m6
+    paddw                m6, m3                          ;   1 * top + 256 * bottom + 255
+    paddw                m2, m6                          ; 128 * top + 129 * bottom + 255
+    mova         [rsp+16*6], m2
+    pmaddubsw            m2, m1, m5
+    paddw                m3, m1                          ;   1 * top + 256 * bottom + 255
+    mova         [rsp+16*0], m1
+    paddw                m2, m3                          ; 128 * top + 129 * bottom + 255
+    mova         [rsp+16*1], m2
+    mova         [rsp+16*3], m4
+    mova         [rsp+16*4], m5
+.w16_loop:
+    movd                 m1, [tlq+hq]                    ; left
+    pshufb               m1, [base+pb_3]                 ; topleft[-(1 + y)]
+    punpcklbw            m1, m4                          ; left, right
+    pmaddubsw            m2, m1, m5                      ; 127 * left - 127 * right
+    paddw                m2, m1                          ; 128 * left + 129 * right
+    mova                 m0, m1
+    mova                 m3, m2
+    pmaddubsw            m0, [base+smooth_weights+16*2]  ; weights_hor = &dav1d_sm_weights[width];
+    pmaddubsw            m1, [base+smooth_weights+16*3]
+    paddw                m2, m0
+    paddw                m3, m1
+    movd                 m1, [v_weightsq]                ; weights_ver = &dav1d_sm_weights[height];
+    add          v_weightsq, 2
+    mova                 m7, [rsp+16*2]
+    pshufb               m1, m7
+    mova         [rsp+16*7], m3
+    mova                 m4, [rsp+16*0]
+    mova                 m5, [rsp+16*1]
+    mova                 m3, [rsp+16*5]
+    mova                 m7, [rsp+16*6]
+    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
+    mova                 m4, [rsp+16*3]
+    mova                 m5, [rsp+16*4]
+    mova             [dstq], m0
+    lea                dstq, [dstq+strideq]
+    sub                  hd, 1
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
+    movu                 m2, [tlq+17]                    ; top
+    mova         [rsp+16*0], m1
+    mova         [rsp+16*1], m2
+    sub                 tlq, 4
+    sub                 tlq, hq
+    mova                 m7, [base+ipred_v_shuf]
+    pshufd               m7, m7, q0000
+    mova         [rsp+16*2], m7
+    mova         [rsp+16*3], m0
+    mova         [rsp+16*4], m4
+    mova         [rsp+16*5], m5
+.w32_loop:
+    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+    add                dstq, 16
+    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+    lea                dstq, [dstq-16+strideq]
+    add          v_weightsq, 2
+    sub                  hd, 1
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
+    movu                 m2, [tlq+17]                    ; top
+    mova         [rsp+16*0], m1
+    mova         [rsp+16*1], m2
+    movu                 m1, [tlq+33]                    ; top
+    movu                 m2, [tlq+49]                    ; top
+    mova        [rsp+16*11], m1
+    mova        [rsp+16*12], m2
+    sub                 tlq, 4
+    sub                 tlq, hq
+    mova                 m7, [base+ipred_v_shuf]
+    pshufd               m7, m7, q0000
+    mova         [rsp+16*2], m7
+    mova         [rsp+16*3], m0
+    mova         [rsp+16*4], m4
+    mova         [rsp+16*5], m5
+.w64_loop:
+    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9,  [base+smooth_weights+16*8],  [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+    add                dstq, 16
+    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+    add                dstq, 16
+    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+    add                dstq, 16
+    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+    lea                dstq, [dstq-48+strideq]
+    add          v_weightsq, 2
+    sub                  hd, 1
+    jg .w64_loop
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+;                                         const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
+cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
+    mova                 m4, [palq]
+    LEA                  r2, pal_pred_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r2+wq*4]
+    packuswb             m4, m4
+    add                  wq, r2
+    lea                  r2, [strideq*3]
+    jmp                  wq
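+; The 8-entry palette (packed to bytes by packuswb above) lets pshufb act as
+; 16 parallel table lookups, one index byte per pixel.  Scalar sketch
+; (illustrative):
+;
+;   // dst[y][x] = pal[idx[y*w + x]]
+;   for (int y = 0; y < h; y++)
+;       for (int x = 0; x < w; x++)
+;           dst[y*stride + x] = (uint8_t)pal[idx[y*w + x]];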
+.w4:
+    pshufb               m0, m4, [idxq]
+    add                idxq, 16
+    movd   [dstq          ], m0
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq  ], m1
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*2], m0
+    psrlq                m0, 32
+    movd   [dstq+r2       ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+.w8:
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    add                idxq, 32
+    movq   [dstq          ], m0
+    movhps [dstq+strideq  ], m0
+    movq   [dstq+strideq*2], m1
+    movhps [dstq+r2       ], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova   [dstq          ], m0
+    mova   [dstq+strideq  ], m1
+    mova   [dstq+strideq*2], m2
+    mova   [dstq+r2       ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w16
+    RET
+ALIGN function_align
+.w32:
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova  [dstq           ], m0
+    mova  [dstq+16        ], m1
+    mova  [dstq+strideq   ], m2
+    mova  [dstq+strideq+16], m3
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32
+    RET
+ALIGN function_align
+.w64:
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova          [dstq   ], m0
+    mova          [dstq+16], m1
+    mova          [dstq+32], m2
+    mova          [dstq+48], m3
+    add                dstq, strideq
+    sub                  hd, 1
+    jg .w64
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                           const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+%macro IPRED_CFL 1                   ; ac in, unpacked pixels out
+    psignw               m3, m%1, m1
+    pabsw               m%1, m%1
+    pmulhrsw            m%1, m2
+    psignw              m%1, m3
+    paddw               m%1, m0
+%endmacro
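+
+; pmulhrsw only handles the magnitude: alpha is prescaled by <<9 so the
+; implicit >>16 leaves (|alpha*ac| + 32) >> 6, and psignw/pabsw save and
+; reapply the sign of alpha*ac.  Scalar sketch (packuswb does the clipping):
+;
+;   #include <stdlib.h>
+;   static int cfl_px(int dc, int alpha, int ac) {
+;       int diff = alpha * ac;
+;       int v = (abs(diff) + 32) >> 6;
+;       return dc + (diff < 0 ? -v : v);  // then clipped to [0, 255]
+;   }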
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    movifnidn            wd, wm
+    movifnidn            hd, hm
+    tzcnt               r6d, hd
+    lea                 t0d, [wq+hq]
+    movd                 m4, t0d
+    tzcnt               t0d, t0d
+    movd                 m5, t0d
+    LEA                  t0, ipred_cfl_ssse3_table
+    tzcnt                wd, wd
+    movsxd               r6, [t0+r6*4]
+    movsxd               wq, [t0+wq*4+16]
+    pcmpeqd              m3, m3
+    psrlw                m4, 1
+    add                  r6, t0
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+.h4:
+    movd                 m0, [tlq-4]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w4:
+    movd                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    psubw                m0, m4
+    paddw                m0, m1
+    pmaddwd              m0, m3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw                m0, 3                             ; dc >>= ctz(width + height);
+    jmp .w4_end
+.w4_mul:
+    punpckhqdq           m1, m0, m0
+    paddw                m0, m1
+    pshuflw              m1, m0, q1032                     ; psrlq                m1, m0, 32
+    paddw                m0, m1
+    psrlw                m0, 2
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hd, 8
+    cmovz               r6d, r2d
+    movd                 m5, r6d
+    pmulhuw              m0, m5
+.w4_end:
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0
+.s4:
+    movd                 m1, alpham
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s4_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+16]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    movd   [dstq+strideq*0], m4
+    pshuflw              m4, m4, q1032
+    movd   [dstq+strideq*1], m4
+    punpckhqdq           m4, m4
+    movd   [dstq+strideq*2], m4
+    psrlq                m4, 32
+    movd   [dstq+r6       ], m4
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .s4_loop
+    RET
+ALIGN function_align
+.h8:
+    movq                 m0, [tlq-8]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w8:
+    movq                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    psubw                m4, m0
+    punpckhqdq           m0, m0
+    psubw                m0, m4
+    paddw                m0, m1
+    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                m0, m1
+    pmaddwd              m0, m3
+    psrlw                m0, m5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    cmp                  hd, 32
+    cmovz               r6d, r2d
+    movd                 m1, r6d
+    pmulhuw              m0, m1
+.w8_end:
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0
+.s8:
+    movd                 m1, alpham
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s8_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+16]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    movq   [dstq          ], m4
+    movhps [dstq+strideq  ], m4
+    mova                 m4, [acq+32]
+    mova                 m5, [acq+48]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    movq   [dstq+strideq*2], m4
+    movhps [dstq+r6       ], m4
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .s8_loop
+    RET
+ALIGN function_align
+.h16:
+    mova                 m0, [tlq-16]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w16:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    psubw                m4, m0
+    punpckhqdq           m0, m0
+    psubw                m0, m4
+    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                m0, m1
+    pmaddwd              m0, m3
+    psrlw                m0, m5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hd, 8|32
+    cmovz               r6d, r2d
+    movd                 m1, r6d
+    pmulhuw              m0, m1
+.w16_end:
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0
+.s16:
+    movd                 m1, alpham
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    pabsw                m2, m1
+    psllw                m2, 9
+.s16_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+16]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    mova             [dstq], m4
+    mova                 m4, [acq+32]
+    mova                 m5, [acq+48]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    mova     [dstq+strideq], m4
+    lea                dstq, [dstq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .s16_loop
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    mova                 m2, [tlq-16]
+    pmaddubsw            m2, m3
+    paddw                m0, m2
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    movu                 m2, [tlq+17]
+    pmaddubsw            m2, m3
+    paddw                m1, m2
+    paddw                m0, m1
+    psubw                m4, m0
+    punpckhqdq           m0, m0
+    psubw                m0, m4
+    pshuflw              m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                m0, m1
+    pmaddwd              m0, m3
+    psrlw                m0, m5
+    cmp                  hd, 32
+    je .w32_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hd, 64|16
+    cmovz               r6d, r2d
+    movd                 m1, r6d
+    pmulhuw              m0, m1
+.w32_end:
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0
+.s32:
+    movd                 m1, alpham
+    pshuflw              m1, m1, q0000
+    punpcklqdq           m1, m1
+    pabsw                m2, m1
+    psllw                m2, 9
+.s32_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+16]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    mova             [dstq], m4
+    mova                 m4, [acq+32]
+    mova                 m5, [acq+48]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    mova          [dstq+16], m4
+    add                dstq, strideq
+    add                 acq, 64
+    dec                  hd
+    jg .s32_loop
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                           const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    mov                  hd, hm                                 ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 t0d, 0x8000
+    movd                 m3, t0d
+    movd                 m2, r6d
+    psrld                m3, m2
+    LEA                  t0, ipred_cfl_left_ssse3_table
+    movsxd               r6, [t0+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+.h32:
+    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h16:
+    pshufd               m1, m0, q3232                          ; psrldq              m1, m0, 8
+    paddw                m0, m1
+.h8:
+    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
+    paddw                m0, m1
+.h4:
+    pmaddwd              m0, m2
+    pmulhrsw             m0, m3
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                           const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    LEA                  t0, ipred_cfl_left_ssse3_table
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    movd                 m3, r6d
+    movd                 m2, wd
+    psrld                m3, m2
+    movsxd               r6, [t0+wq*4]
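+    ; the left table is deliberately indexed by width here: the top edge is
+    ; w pixels long, so the .h* entry points double as top-row reductions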
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                           const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    LEA                  r6, ipred_cfl_splat_ssse3_table
+    movsxd               wq, [r6+wq*4]
+    movddup              m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
+    add                  wq, r6
+    movifnidn           acq, acmp
+    jmp                  wq
+
+%macro RELOAD_ACQ_32 1
+    mov                 acq, ac_bakq       ; restore acq
+%endmacro
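+; (the macro argument is unused; acq is always reloaded from ac_bakq, which
+; is a register on x86-64 and a memory slot on x86-32)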
+
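+; cfl_ac: downsample the luma plane to the chroma block size, storing it
+; scaled by 8 (the pb_2/pb_4 pmaddubsw multipliers sum 4, 2 or 1 luma pixels
+; per output for 4:2:0, 4:2:2 and 4:4:4 respectively), then subtract the
+; rounded average so the stored AC values sum to approximately zero.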
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+DECLARE_REG_TMP 7
+    movddup              m2, [pb_2]
+%else
+cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+DECLARE_REG_TMP 4
+%define ac_bakq acmp
+    mov                 t0d, 0x02020202
+    movd                 m2, t0d
+    pshufd               m2, m2, q0000
+%endif
+    movifnidn            wd, wm
+    mov                 t0d, hm
+    mov                  hd, t0d
+    imul                t0d, wd
+    movd                 m5, t0d
+    movifnidn         hpadd, hpadm
+%if ARCH_X86_64
+    mov             ac_bakq, acq
+%endif
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    pxor                 m4, m4
+    cmp                  wd, 8
+    jg .w16
+    je .w8
+    ; fall-through
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+    lea            stride3q, [strideq*3]
+.w4_loop:
+    movq                 m0, [yq]
+    movq                 m1, [yq+strideq]
+    movhps               m0, [yq+strideq*2]
+    movhps               m1, [yq+stride3q]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 16
+    sub                  hd, 2
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg_4_8
+    punpckhqdq           m0, m0
+.w4_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 16
+    sub               hpadd, 2
+    jg .w4_hpad_loop
+    jmp .calc_avg_4_8
+.w8:
+    lea            stride3q, [strideq*3]
+    test              wpadd, wpadd
+    jnz .w8_wpad
+.w8_loop:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    mova                 m0, [yq+strideq*2]
+    mova                 m1, [yq+stride3q]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova           [acq+16], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 2
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg_4_8
+    jmp .w8_hpad
+.w8_wpad:                                              ; wpadd=1
+    movddup              m0, [yq]
+    movddup              m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    pshufhw              m0, m0, q3333
+    mova              [acq], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 16
+    sub                  hd, 1
+    jg .w8_wpad
+    test              hpadd, hpadd
+    jz .calc_avg_4_8
+.w8_hpad:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 16
+    sub               hpadd, 1
+    jg .w8_hpad
+    jmp .calc_avg_4_8
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    mova                 m6, [yq+16]
+    mova                 m1, [yq+strideq+16]
+    pmaddubsw            m6, m2
+    pmaddubsw            m1, m2
+    paddw                m6, m1
+    mova           [acq+16], m6
+    paddw                m4, m6
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg16
+    jmp .w16_hpad_loop
+.w16_wpad:
+    cmp               wpadd, 2
+    jl .w16_pad1
+    je .w16_pad2
+.w16_pad3:
+    movddup              m0, [yq]
+    movddup              m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    pshufhw              m0, m0, q3333
+    mova              [acq], m0
+    paddw                m4, m0
+    mova                 m6, m0
+    punpckhqdq           m6, m0, m0
+    mova           [acq+16], m6
+    paddw                m4, m6
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jg .w16_pad3
+    jmp .w16_wpad_done
+.w16_pad2:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    pshufhw              m6, m0, q3333
+    punpckhqdq           m6, m6
+    mova           [acq+16], m6
+    paddw                m4, m6
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jg .w16_pad2
+    jmp .w16_wpad_done
+.w16_pad1:
+    mova                 m0, [yq]
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    mova              [acq], m0
+    paddw                m4, m0
+    movddup              m6, [yq+16]
+    movddup              m1, [yq+strideq+16]
+    pmaddubsw            m6, m2
+    pmaddubsw            m1, m2
+    paddw                m6, m1
+    pshufhw              m6, m6, q3333
+    mova           [acq+16], m6
+    paddw                m4, m6
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    dec                  hd
+    jg .w16_pad1
+.w16_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg16
+.w16_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    mova           [acq+16], m6
+    paddw                m4, m6
+    add                 acq, 32
+    dec               hpadd
+    jg .w16_hpad_loop
+    jmp .calc_avg16
+
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4_8:
+    psrlw                m2, 9
+    pmaddwd              m4, m2
+    jmp .calc_avg
+.calc_avg16:
+    psrld                m0, m4, 16
+    pslld                m4, 16
+    psrld                m4, 16
+    paddd                m4, m0
+.calc_avg:
+    movd                szd, m5
+    psrad                m5, 1
+    tzcnt               r1d, szd
+    paddd                m4, m5
+    movd                 m1, r1d
+    pshufd               m0, m4, q2301
+    paddd                m0, m4
+    pshufd               m4, m0, q1032
+    paddd                m0, m4
+    psrad                m0, m1                        ; sum >>= log2sz;
+    packssdw             m0, m0
+    RELOAD_ACQ_32       acq
+.sub_loop:
+    mova                 m1, [acq]
+    psubw                m1, m0                        ; ac[x] -= sum;
+    mova              [acq], m1
+    add                 acq, 16
+    sub                 szd, 8
+    jg .sub_loop
+    RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+    movddup              m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+    mov                 t0d, 0x04040404
+    movd                 m2, t0d
+    pshufd               m2, m2, q0000
+%endif
+    movifnidn            wd, wm
+    mov                 t0d, hm
+    mov                  hd, t0d
+    imul                t0d, wd
+    movd                 m6, t0d
+    movifnidn         hpadd, hpadm
+%if ARCH_X86_64
+    mov             ac_bakq, acq
+%endif
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    pxor                 m4, m4
+    pxor                 m5, m5
+    cmp                  wd, 8
+    jg .w16
+    je .w8
+    ; fall-through
+
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+    lea            stride3q, [strideq*3]
+.w4_loop:
+    movq                 m1, [yq]
+    movhps               m1, [yq+strideq]
+    movq                 m0, [yq+strideq*2]
+    movhps               m0, [yq+stride3q]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg_4
+    punpckhqdq           m0, m0
+.w4_hpad_loop:
+    mova              [acq], m0
+    paddw                m4, m0
+    add                 acq, 16
+    sub               hpadd, 2
+    jg .w4_hpad_loop
+    jmp .calc_avg_4
+.w8:
+    lea            stride3q, [strideq*3]
+    test              wpadd, wpadd
+    jnz .w8_wpad
+.w8_loop:
+    mova                 m1, [yq]
+    mova                 m0, [yq+strideq]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    mova                 m1, [yq+strideq*2]
+    mova                 m0, [yq+stride3q]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    mova           [acq+48], m0
+    paddw                m4, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+    jmp .w8_hpad
+.w8_wpad:
+    movddup              m1, [yq]
+    pmaddubsw            m1, m2
+    pshufhw              m1, m1, q3333
+    mova              [acq], m1
+    paddw                m5, m1
+    movddup              m0, [yq+strideq]
+    pmaddubsw            m0, m2
+    pshufhw              m0, m0, q3333
+    mova           [acq+16], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    sub                  hd, 2
+    jg .w8_wpad
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+.w8_hpad:
+    mova              [acq], m0
+    paddw                m4, m0
+    mova           [acq+16], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 2
+    jg .w8_hpad
+    jmp .calc_avg_8_16
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    mova                 m1, [yq]
+    mova                 m0, [yq+16]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m5, m0
+    paddw                m5, m1
+    mova                 m1, [yq+strideq]
+    mova                 m0, [yq+strideq+16]
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    mova           [acq+48], m0
+    paddw                m4, m0
+    paddw                m4, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+    jmp .w16_hpad_loop
+.w16_wpad:
+    cmp               wpadd, 2
+    jl .w16_pad1
+    je .w16_pad2
+.w16_pad3:
+    movddup              m1, [yq]
+    pmaddubsw            m1, m2
+    pshufhw              m1, m1, q3333
+    mova              [acq], m1
+    paddw                m5, m1
+    punpckhqdq           m1, m1
+    mova           [acq+16], m1
+    paddw                m5, m1
+    movddup              m1, [yq+strideq]
+    pmaddubsw            m1, m2
+    pshufhw              m1, m1, q3333
+    mova           [acq+32], m1
+    paddw                m4, m1
+    punpckhqdq           m0, m1, m1
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad3
+    jmp .w16_wpad_done
+.w16_pad2:
+    mova                 m1, [yq]
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    pshufhw              m1, m1, q3333
+    punpckhqdq           m1, m1
+    mova           [acq+16], m1
+    paddw                m5, m1
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    mova                 m0, m1
+    pshufhw              m0, m0, q3333
+    punpckhqdq           m0, m0
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad2
+    jmp .w16_wpad_done
+.w16_pad1:
+    mova                 m1, [yq]
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    movddup              m0, [yq+16]
+    pmaddubsw            m0, m2
+    pshufhw              m0, m0, q3333
+    mova           [acq+16], m0
+    paddw                m5, m0
+    mova                 m1, [yq+strideq]
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    movddup              m0, [yq+strideq+16]
+    pmaddubsw            m0, m2
+    pshufhw              m0, m0, q3333
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad1
+.w16_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+.w16_hpad_loop:
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m4, m1
+    paddw                m5, m0
+    mova           [acq+32], m1
+    mova           [acq+48], m0
+    paddw                m4, m1
+    paddw                m5, m0
+    add                 acq, 64
+    sub               hpadd, 2
+    jg .w16_hpad_loop
+    jmp .calc_avg_8_16
+
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4:
+    psrlw                m2, 10
+    pmaddwd              m5, m2
+    pmaddwd              m0, m4, m2
+    jmp .calc_avg
+.calc_avg_8_16:
+    mova                 m0, m5
+    psrld                m5, 16
+    pslld                m0, 16
+    psrld                m0, 16
+    paddd                m5, m0
+    mova                 m0, m4
+    psrld                m0, 16
+    pslld                m4, 16
+    psrld                m4, 16
+    paddd                m0, m4
+.calc_avg:
+    paddd                m5, m0
+    movd                szd, m6
+    psrad                m6, 1
+    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
+    paddd                m5, m6
+    movd                 m1, r1d
+    pshufd               m0, m5, q2301
+    paddd                m0, m5
+    pshufd               m5, m0, q1032
+    paddd                m0, m5
+    psrad                m0, m1                        ; sum >>= log2sz;
+    packssdw             m0, m0
+    RELOAD_ACQ_32       acq                            ; ac = ac_orig
+.sub_loop:
+    mova                 m1, [acq]
+    psubw                m1, m0
+    mova              [acq], m1
+    add                 acq, 16
+    sub                 szd, 8
+    jg .sub_loop
+    RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
+    movddup              m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
+%define ac_bakq [rsp+16*4]
+    mov                 t0d, 0x04040404
+    movd                 m2, t0d
+    pshufd               m2, m2, q0000
+%endif
+    movifnidn            wd, wm
+    movifnidn         hpadd, hpadm
+    movd                 m0, hpadd
+    mov                 t0d, hm
+    mov                  hd, t0d
+    imul                t0d, wd
+    movd                 m6, t0d
+    movd              hpadd, m0
+    mov             ac_bakq, acq
+    shl               hpadd, 2
+    sub                  hd, hpadd
+    pxor                 m5, m5
+    pxor                 m4, m4
+    cmp                  wd, 16
+    jg .w32
+    cmp                  wd, 8
+    jg .w16
+    je .w8
+    ; fall-through
+
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+    lea            stride3q, [strideq*3]
+.w4_loop:
+    movd                 m1, [yq]
+    movd                 m3, [yq+strideq]
+    punpckldq            m1, m3
+    punpcklbw            m1, m1
+    movd                 m0, [yq+strideq*2]
+    movd                 m3, [yq+stride3q]
+    punpckldq            m0, m3
+    punpcklbw            m0, m0
+    pmaddubsw            m1, m2
+    pmaddubsw            m0, m2
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m5, m0
+    paddw                m5, m1
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg_4
+    punpckhqdq           m0, m0
+.w4_hpad_loop:
+    mova              [acq], m0
+    paddw                m5, m0
+    add                 acq, 16
+    sub               hpadd, 2
+    jg .w4_hpad_loop
+.calc_avg_4:
+    psrlw                m2, 10
+    pmaddwd              m5, m2
+    jmp .calc_avg
+
+.w8:
+    lea            stride3q, [strideq*3]
+    test              wpadd, wpadd
+    jnz .w8_wpad
+.w8_loop:
+    movq                 m1, [yq]
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    movq                 m0, [yq+strideq]
+    punpcklbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0
+    movq                 m1, [yq+strideq*2]
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    movq                 m0, [yq+stride3q]
+    punpcklbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+    jmp .w8_hpad
+.w8_wpad:
+    movd                 m1, [yq]
+    punpcklbw            m1, m1
+    punpcklqdq           m1, m1
+    pmaddubsw            m1, m2
+    pshufhw              m1, m1, q3333
+    mova              [acq], m1
+    paddw                m5, m1
+    movd                 m0, [yq+strideq]
+    punpcklbw            m0, m0
+    punpcklqdq           m0, m0
+    pmaddubsw            m0, m2
+    pshufhw              m0, m0, q3333
+    mova           [acq+16], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 32
+    sub                  hd, 2
+    jg .w8_wpad
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+.w8_hpad:
+    mova              [acq], m0
+    paddw                m5, m0
+    mova           [acq+16], m0
+    paddw                m4, m0
+    add                 acq, 32
+    sub               hpadd, 2
+    jg .w8_hpad
+    jmp .calc_avg_8_16
+
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0
+    mova                 m0, [yq+strideq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+    jmp .w16_hpad_loop
+.w16_wpad:
+    cmp               wpadd, 2
+    jl .w16_pad1
+    je .w16_pad2
+.w16_pad3:
+    movd                 m1, [yq]
+    punpcklbw            m1, m1
+    punpcklqdq           m1, m1
+    pshufhw              m1, m1, q3333
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    punpckhqdq           m1, m1
+    mova           [acq+16], m1
+    paddw                m5, m1
+    movd                 m1, [yq+strideq]
+    punpcklbw            m1, m1
+    punpcklqdq           m1, m1
+    pshufhw              m1, m1, q3333
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    punpckhqdq           m0, m1, m1
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad3
+    jmp .w16_wpad_done
+.w16_pad2:
+    movq                 m1, [yq]
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    pshufhw              m1, m1, q3333
+    punpckhqdq           m1, m1
+    mova           [acq+16], m1
+    paddw                m5, m1
+    movq                 m1, [yq+strideq]
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    mova                 m0, m1
+    pshufhw              m0, m0, q3333
+    punpckhqdq           m0, m0
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad2
+    jmp .w16_wpad_done
+.w16_pad1:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1
+    punpckhbw            m0, m0
+    punpcklqdq           m0, m0
+    pshufhw              m0, m0, q3333
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0
+    mova                 m0, [yq+strideq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova           [acq+32], m1
+    paddw                m4, m1
+    punpckhbw            m0, m0
+    punpcklqdq           m0, m0
+    pshufhw              m0, m0, q3333
+    pmaddubsw            m0, m2
+    mova           [acq+48], m0
+    paddw                m4, m0
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .w16_pad1
+.w16_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg_8_16
+.w16_hpad_loop:
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m4, m1
+    paddw                m5, m0
+    mova           [acq+32], m1
+    mova           [acq+48], m0
+    paddw                m4, m1
+    paddw                m5, m0
+    add                 acq, 64
+    sub               hpadd, 2
+    jg .w16_hpad_loop
+.calc_avg_8_16:
+    mova                 m0, m5
+    psrld                m5, 16
+    pslld                m0, 16
+    psrld                m0, 16
+    paddd                m5, m0
+    mova                 m0, m4
+    psrld                m0, 16
+    pslld                m4, 16
+    psrld                m4, 16
+    paddd                m0, m4
+    paddd                m5, m0
+    jmp .calc_avg
+
+.w32:
+    pxor                 m0, m0
+    mova           [rsp   ], m0
+    mova           [rsp+16], m0
+    mova           [rsp+32], m0
+    mova           [rsp+48], m0
+    test              wpadd, wpadd
+    jnz .w32_wpad
+.w32_loop:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m4, [yq+16]
+    mova                 m3, m4
+    punpcklbw            m3, m3
+    pmaddubsw            m3, m2
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    punpckhbw            m4, m4
+    pmaddubsw            m4, m2
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_loop
+    test              hpadd, hpadd
+    jz .calc_avg_32
+    jmp .w32_hpad_loop
+.w32_wpad:
+    cmp               wpadd, 2
+    jl .w32_pad1
+    je .w32_pad2
+    cmp               wpadd, 4
+    jl .w32_pad3
+    je .w32_pad4
+    cmp               wpadd, 6
+    jl .w32_pad5
+    je .w32_pad6
+.w32_pad7:
+    movd                 m1, [yq]
+    punpcklbw            m1, m1
+    punpcklqdq           m1, m1
+    pshufhw              m1, m1, q3333
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    mova                 m0, m1
+    punpckhqdq           m0, m0
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m3, m0
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    mova                 m4, m3
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad7
+    jmp .w32_wpad_done
+.w32_pad6:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    pshufhw              m0, m1, q3333
+    punpckhqdq           m0, m0
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m3, m0
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    mova                 m4, m3
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad6
+    jmp .w32_wpad_done
+.w32_pad5:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    mova                 m5, [rsp]
+    paddw                m5, m1
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    punpcklqdq           m0, m0
+    pshufhw              m0, m0, q3333
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m3, m0
+    punpckhqdq           m3, m3
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    mova                 m4, m3
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad5
+    jmp .w32_wpad_done
+.w32_pad4:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m3, m0
+    pshufhw              m3, m3, q3333
+    punpckhqdq           m3, m3
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    mova                 m4, m3
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad4
+    jmp .w32_wpad_done
+.w32_pad3:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    movd                 m3, [yq+16]
+    punpcklbw            m3, m3
+    punpcklqdq           m3, m3
+    pshufhw              m3, m3, q3333
+    pmaddubsw            m3, m2
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    mova                 m4, m3
+    punpckhqdq           m4, m4
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad3
+    jmp .w32_wpad_done
+.w32_pad2:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m3, [yq+16]
+    punpcklbw            m3, m3
+    pmaddubsw            m3, m2
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    pshufhw              m4, m3, q3333
+    punpckhqdq           m4, m4
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad2
+    jmp .w32_wpad_done
+.w32_pad1:
+    mova                 m0, [yq]
+    mova                 m1, m0
+    punpcklbw            m1, m1
+    pmaddubsw            m1, m2
+    mova              [acq], m1
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    punpckhbw            m0, m0
+    pmaddubsw            m0, m2
+    mova           [acq+16], m0
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova                 m4, [yq+16]
+    mova                 m3, m4
+    punpcklbw            m3, m3
+    pmaddubsw            m3, m2
+    mova           [acq+32], m3
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    punpckhbw            m4, m4
+    punpcklqdq           m4, m4
+    pshufhw              m4, m4, q3333
+    pmaddubsw            m4, m2
+    mova           [acq+48], m4
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    lea                  yq, [yq+strideq]
+    add                 acq, 64
+    sub                  hd, 1
+    jg .w32_pad1
+.w32_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg_32
+.w32_hpad_loop:
+    mova              [acq], m1
+    mova           [acq+16], m0
+    paddw                m5, m1, [rsp]
+    mova           [rsp   ], m5
+    paddw                m5, m0, [rsp+16]
+    mova           [rsp+16], m5
+    mova           [acq+32], m3
+    mova           [acq+48], m4
+    paddw                m5, m3, [rsp+32]
+    mova           [rsp+32], m5
+    paddw                m5, m4, [rsp+48]
+    mova           [rsp+48], m5
+    add                 acq, 64
+    sub               hpadd, 1
+    jg .w32_hpad_loop
+
+%if ARCH_X86_64
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+
+.calc_avg_32:
+    mova                 m5, [rsp]
+    mova                 m0, m5
+    psrld                m5, 16
+    pslld                m0, 16
+    psrld                m0, 16
+    paddd                m5, m0
+    mova                 m0, [rsp+16]
+    mova                 m3, m0
+    psrld                m0, 16
+    pslld                m3, 16
+    psrld                m3, 16
+    paddd                m0, m3
+    paddd                m5, m0
+    mova                 m0, [rsp+32]
+    mova                 m3, m0
+    psrld                m0, 16
+    pslld                m3, 16
+    psrld                m3, 16
+    paddd                m0, m3
+    mova                 m1, [rsp+48]
+    mova                 m3, m1
+    psrld                m1, 16
+    pslld                m3, 16
+    psrld                m3, 16
+    paddd                m1, m3
+    paddd                m1, m0
+    paddd                m5, m1
+.calc_avg:
+    movd                szd, m6
+    psrad                m6, 1
+    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
+    paddd                m5, m6
+    movd                 m1, r1d
+    pshufd               m0, m5, q2301
+    paddd                m0, m5
+    pshufd               m5, m0, q1032
+    paddd                m0, m5
+    psrad                m0, m1                        ; sum >>= log2sz;
+    packssdw             m0, m0
+    RELOAD_ACQ_32       acq                            ; ac = ac_orig
+.sub_loop:
+    mova                 m1, [acq]
+    psubw                m1, m0
+    mova              [acq], m1
+    add                 acq, 16
+    sub                 szd, 8
+    jg .sub_loop
+    RET
+
+; %1 simd register that holds the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
+%macro BLEND 3 ; mask, true, false
+    pand  %2, %1
+    pandn %1, %3
+    por   %1, %2
+%endmacro
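+; i.e. a branchless select: %1 = (%2 & %1) | (%3 & ~%1)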
+
+%macro PAETH 2                                 ; top, ldiff
+    pavgb                m1, m%1, m3
+    pxor                 m0, m%1, m3
+    pand                 m0, m4
+    psubusb              m2, m5, m1
+    psubb                m1, m0
+    psubusb              m1, m5
+    por                  m1, m2
+    paddusb              m1, m1
+    por                  m1, m0               ; min(tldiff, 255)
+    psubusb              m2, m5, m3
+    psubusb              m0, m3, m5
+    por                  m2, m0               ; tdiff
+%ifnum %2
+    pminub               m2, m%2
+    pcmpeqb              m0, m%2, m2          ; ldiff <= tdiff
+%else
+    mova                 m0, %2
+    pminub               m2, m0
+    pcmpeqb              m0, m2
+%endif
+    pminub               m1, m2
+    pcmpeqb              m1, m2               ; ldiff <= tldiff && tdiff <= tldiff
+    mova                 m2, m3
+    BLEND                m0, m2, m%1
+    BLEND                m1, m0, m5
+%endmacro
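+; Standard Paeth prediction: with base = left + top - topleft, pick whichever
+; of left, top and topleft is closest to base (ties prefer left, then top).
+; The pavgb/pxor/pand sequence evaluates tldiff = |left + top - 2*topleft|
+; (saturated to 255) without widening to 16 bits.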
+
+cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    pxor                 m0, m0
+    movd                 m5, [tlq]
+    pshufb               m5, m0
+    LEA                  r5, ipred_paeth_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    movddup              m4, [base+ipred_paeth_shuf]
+    add                  wq, r5
+    jmp                  wq
+.w4:
+    movd                 m6, [tlq+1]            ; top
+    pshufd               m6, m6, q0000
+    lea                  r3, [strideq*3]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0                 ; ldiff
+.w4_loop:
+    sub                 tlq, 4
+    movd                 m3, [tlq]
+    mova                 m1, [base+ipred_h_shuf]
+    pshufb               m3, m1                 ; left
+    PAETH                 6, 7
+    movd   [dstq          ], m1
+    pshuflw              m0, m1, q1032
+    movd   [dstq+strideq  ], m0
+    punpckhqdq           m1, m1
+    movd   [dstq+strideq*2], m1
+    psrlq                m1, 32
+    movd   [dstq+r3       ], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    movddup              m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w8_loop:
+    sub                 tlq, 2
+    movd                 m3, [tlq]
+    pshufb               m3, [base+ipred_paeth_shuf]
+    PAETH                 6, 7
+    movq     [dstq        ], m1
+    movhps   [dstq+strideq], m1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w16_loop:
+    sub                 tlq, 1
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    PAETH                 6, 7
+    mova             [dstq], m1
+    add                dstq, strideq
+    sub                  hd, 1
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp   ], m6
+    mova           [rsp+16], m7
+    movu                 m6, [tlq+17]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+32], m6
+.w32_loop:
+    dec                 tlq
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    mova                 m6, [rsp]
+    PAETH                 6, [rsp+16]
+    mova          [dstq   ], m1
+    mova                 m6, [rsp+32]
+    PAETH                 6, 7
+    mova          [dstq+16], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp   ], m6
+    mova           [rsp+16], m7
+    movu                 m6, [tlq+17]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+32], m6
+    mova           [rsp+48], m7
+    movu                 m6, [tlq+33]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+64], m6
+    mova           [rsp+80], m7
+    movu                 m6, [tlq+49]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+96], m6
+.w64_loop:
+    dec                 tlq
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    mova                 m6, [rsp]
+    PAETH                 6, [rsp+16]
+    mova          [dstq   ], m1
+    mova                 m6, [rsp+32]
+    PAETH                 6, [rsp+48]
+    mova          [dstq+16], m1
+    mova                 m6, [rsp+64]
+    PAETH                 6, [rsp+80]
+    mova          [dstq+32], m1
+    mova                 m6, [rsp+96]
+    PAETH                 6, 7
+    mova          [dstq+48], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+
+
+%macro FILTER 4 ; dst, src, tmp, shuf
+%ifnum %4
+    pshufb               m%2, m%4
+%else
+    pshufb               m%2, %4
+%endif
+    pshufd               m%1, m%2, q0000           ;p0 p1
+    pmaddubsw            m%1, m2
+    pshufd               m%3, m%2, q1111           ;p2 p3
+    pmaddubsw            m%3, m3
+    paddw                m%1, [base+pw_8]
+    paddw                m%1, m%3
+    pshufd               m%3, m%2, q2222           ;p4 p5
+    pmaddubsw            m%3, m4
+    paddw                m%1, m%3
+    pshufd               m%3, m%2, q3333           ;p6 __
+    pmaddubsw            m%3, m5
+    paddw                m%1, m%3
+    psraw                m%1, 4
+    packuswb             m%1, m%1
+%endmacro
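+; Computes one 4x2 block of filter-intra output: pshufb gathers the seven
+; source pixels p0-p6, the four packed tap pairs preloaded in m2-m5 (from
+; filter_intra_taps) are applied with pmaddubsw, and the sum is rounded
+; with (x + 8) >> 4 and packed back to bytes.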
+
+cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+    LEA                   r6, $$
+    tzcnt                 wd, wm
+%ifidn filterd, filterm
+    movzx            filterd, filterb
+%else
+    movzx            filterd, byte filterm
+%endif
+    shl              filterd, 6
+    lea              filterq, [base+filter_intra_taps+filterq]
+    movq                  m0, [tlq-3]                     ;_ 6 5 0 1 2 3 4
+    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
+    mova                  m2, [filterq+16*0]
+    mova                  m3, [filterq+16*1]
+    mova                  m4, [filterq+16*2]
+    mova                  m5, [filterq+16*3]
+    lea                   wq, [base+ipred_filter_ssse3_table+wq]
+    mov                   hd, hm
+    jmp                   wq
+.w4:
+    mova                  m1, [base+filter_shuf1]
+    sub                  tlq, 3
+    sub                  tlq, hq
+    jmp .w4_loop_start
+.w4_loop:
+    movd                  m0, [tlq+hq]
+    punpckldq             m0, m6
+    lea                 dstq, [dstq+strideq*2]
+.w4_loop_start:
+    FILTER                 6, 0, 7, 1
+    movd    [dstq+strideq*0], m6
+    pshuflw               m6, m6, q1032
+    movd    [dstq+strideq*1], m6
+    sub                   hd, 2
+    jg .w4_loop
+    RET
+
+ALIGN function_align
+.w8:
+    movq                  m6, [tlq+1]                   ;_ _ _ 0 1 2 3 4
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w8_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m6, m7                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER                 0, 6, 1, [base+filter_shuf2]
+
+    punpckldq             m6, m7, m0
+    movq    [dstq+strideq*0], m6
+    punpckhqdq            m6, m6
+    movq    [dstq+strideq*1], m6
+
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w8_loop
+    RET
+
+ALIGN function_align
+.w16:
+    movu                  m6, [tlq+1]                   ;top row
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w16_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd    [dstq+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+4+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+8+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movd [dstq+12+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova    [dstq+strideq*1], m6
+
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w16_loop
+    RET
+
+ALIGN function_align
+.w32:
+    movu                  m6, [tlq+1]                   ;top row
+    lea              filterq, [tlq+17]
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w32_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd    [dstq+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+4+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+8+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movu                  m1, [filterq]
+    punpckldq             m0, m7, m1                    ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+    punpcklqdq            m0, m6                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+12+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova    [dstq+strideq*1], m6
+
+    mova                  m6, m1
+
+    FILTER                 7, 0, 6, [base+filter_shuf2]
+    punpcklqdq            m0, m1, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+16+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m1, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+20+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+24+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movd [dstq+28+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova [dstq+16+strideq*1], m6
+
+    mova                  m6, [dstq+strideq*1]
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+    lea              filterq, [dstq+16+strideq*1]
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w32_loop
+    RET
diff --git a/src/x86/itx.asm b/src/x86/itx.asm
new file mode 100644
index 0000000..f27b900
--- /dev/null
+++ b/src/x86/itx.asm
@@ -0,0 +1,5562 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+; Note: The order of (at least some of) those constants matters!
+
+deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+
+%macro COEF_PAIR 2
+pw_%1_%2:  dw  %1, %2
+pw_m%2_%1: dw -%2, %1
+%endmacro
+
+; ADST-only
+pw_3803_1321:   dw  3803,  1321
+pw_m1321_2482:  dw -1321,  2482
+pw_2482_3344:   dw  2482,  3344
+pw_m3344_3344:  dw -3344,  3344
+pw_m3803_3344:  dw -3803,  3344
+pw_m3803_m6688: dw -3803, -6688
+pw_2896_m2896:  dw  2896, -2896
+
+pw_5:       times 2 dw 5
+pw_2048:    times 2 dw 2048
+pw_4096:    times 2 dw 4096
+pw_8192:    times 2 dw 8192
+pw_16384:   times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8:  times 2 dw 1697*8
+pw_2896x8:  times 2 dw 2896*8
+
+pd_2048: dd 2048
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR 3784, 1567
+COEF_PAIR  201, 4091
+COEF_PAIR  995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4052,  601
+COEF_PAIR  401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR  799, 4017
+COEF_PAIR 3406, 2276
+pw_m799_m4017:  dw  -799, -4017
+pw_m1567_m3784: dw -1567, -3784
+pw_m3406_m2276: dw -3406, -2276
+pw_m401_m4076:  dw  -401, -4076
+pw_m3166_m2598: dw -3166, -2598
+pw_m1931_m3612: dw -1931, -3612
+pw_m3920_m1189: dw -3920, -1189
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017,  799
+
+%macro COEF_X8 1-*
+%rep %0
+    dw %1*8, %1*8
+    %rotate 1
+%endrep
+%endmacro
+
+pw_3703x8:  COEF_X8  3703
+pw_1751x8:  COEF_X8  1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8:  COEF_X8  3857
+pw_3973x8:  COEF_X8  3973
+pw_995x8:   COEF_X8   995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8:  COEF_X8  3513
+pw_3290x8:  COEF_X8  3290
+pw_2440x8:  COEF_X8  2440
+pw_m601x8:  COEF_X8  -601
+pw_4052x8:  COEF_X8  4052
+
+idct64_mul: COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
+            COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
+            COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
+            COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
+
+pw_201_4091x8:   dw   201*8, 4091*8
+pw_m601_4052x8:  dw  -601*8, 4052*8
+pw_995_3973x8:   dw   995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8:  dw  1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8:  dw  2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+%define o_idct64_offset idct64_mul - (o_base) - 8
+
+SECTION .text
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative to that with
+; 1-byte offsets, as long as the data is within +-128 bytes of the base pointer.
+%define o_base deint_shuf + 128
+%define o(x) (rax - (o_base) + (x))
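+; For example, once a function has done "lea rax, [o_base]", a load such as
+;   vpbroadcastd         m0, [o(pw_2048)]
+; resolves to [rax-(deint_shuf+128)+pw_2048], which the assembler can encode
+; with a 1-byte displacement instead of a 4-byte rip-relative offset.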
+
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
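+; e.g. "REPX {pmulhrsw x, m7}, m0, m1, m2" expands to one pmulhrsw per
+; listed register, substituting each in turn for x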
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4
+    pmaddwd             m%2, m%5, m%1
+    pmaddwd             m%1, m%6
+%else
+%if %7 & 1
+    vpbroadcastd        m%2, [o(pw_%5_%6)]
+    vpbroadcastd        m%3, [o(pw_m%6_%5)]
+%else
+    vpbroadcastd        m%2, [o(pw_m%6_%5)]
+    vpbroadcastd        m%3, [o(pw_%5_%6)]
+%endif
+    pmaddwd             m%2, m%1
+    pmaddwd             m%1, m%3
+%endif
+    paddd               m%2, m%4
+    paddd               m%1, m%4
+%if %7 & 2
+    pslld               m%2, 4
+    psrld               m%1, 12
+    pblendw             m%1, m%2, 0xaa
+%else
+    psrad               m%2, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%2
+%endif
+%endmacro
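+; i.e. for each interleaved 16-bit pair (x, y), ignoring the flag variants:
+; out_lo = (x*coef1 + y*coef2 + rnd) >> 12, out_hi = (y*coef1 - x*coef2 + rnd) >> 12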
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
+%if %10 & 1
+    vpbroadcastd        m%3, [o(pw_%8_%9)]
+    vpbroadcastd        m%4, [o(pw_m%9_%8)]
+    vpbroadcastd       xm%2, [o(pw_%6_%7)]
+    vpblendd            m%2, m%3, 0xf0
+    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
+%else
+    vpbroadcastd        m%3, [o(pw_m%9_%8)]
+    vpbroadcastd        m%4, [o(pw_%8_%9)]
+    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
+    vpblendd            m%2, m%3, 0xf0
+    vpbroadcastd       xm%3, [o(pw_%6_%7)]
+%endif
+    vpblendd            m%3, m%4, 0xf0
+    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+    punpckhwd           m%3, m%2, m%1
+    punpcklwd           m%2, m%1
+%if %7 < 32
+    pmaddwd             m%1, m%7, m%2
+    pmaddwd             m%4, m%7, m%3
+%else
+    vpbroadcastd        m%1, [o(pw_m%7_%6)]
+    pmaddwd             m%4, m%3, m%1
+    pmaddwd             m%1, m%2
+%endif
+    paddd               m%4, m%5
+    paddd               m%1, m%5
+    psrad               m%4, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%4
+%if %7 < 32
+    pmaddwd             m%3, m%6
+    pmaddwd             m%2, m%6
+%else
+    vpbroadcastd        m%4, [o(pw_%6_%7)]
+    pmaddwd             m%3, m%4
+    pmaddwd             m%2, m%4
+%endif
+    paddd               m%3, m%5
+    paddd               m%2, m%5
+    psrad               m%3, 12
+    psrad               m%2, 12
+%if %0 == 8
+    packssdw            m%8, m%2, m%3
+%else
+    packssdw            m%2, m%3
+%endif
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
+    psubsw              m%3, m%1, m%2
+    paddsw              m%2, m%1
+    paddsw              m%1, m%4, m%5
+    psubsw              m%4, m%5
+%endmacro
+
+%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
+    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
+    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
+    paddsw              m%9, m%2, m%6  ; t4
+    psubsw              m%2, m%6       ; t5a
+    paddsw             m%10, m%8, m%4  ; t7
+    psubsw              m%8, m%4       ; t6a
+    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+    psubsw              m%6, m%1, m%3  ; dct4 out2
+    paddsw              m%3, m%1       ; dct4 out1
+    paddsw              m%1, m%5, m%7  ; dct4 out0
+    psubsw              m%5, m%7       ; dct4 out3
+    psubsw              m%7, m%3, m%2  ; out6
+    paddsw              m%2, m%3       ; out1
+    paddsw              m%3, m%6, m%8  ; out2
+    psubsw              m%6, m%8       ; out5
+    psubsw              m%8, m%1, m%10 ; out7
+    paddsw              m%1, m%10      ; out0
+    paddsw              m%4, m%5, m%9  ; out3
+    psubsw              m%5, m%9       ; out4
+%endmacro
+
+; in1 = %1, in3  = %2, in5  = %3, in7  = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
+    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
+    psubsw              m%9, m%2, m%6 ; t13
+    paddsw              m%6, m%2      ; t12
+    psubsw              m%2, m%8, m%4 ; t14
+    paddsw              m%8, m%4      ; t15
+    psubsw              m%4, m%7, m%3 ; t10
+    paddsw              m%3, m%7      ; t11
+    psubsw              m%7, m%1, m%5 ; t9
+    paddsw              m%1, m%5      ; t8
+    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
+    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
+    psubsw              m%5, m%1, m%3 ; t11a
+    paddsw              m%1, m%3      ; t8a
+    psubsw              m%3, m%7, m%4 ; t13
+    paddsw              m%7, m%4      ; t14
+    psubsw              m%4, m%8, m%6 ; t12a
+    paddsw              m%8, m%6      ; t15a
+    psubsw              m%6, m%2, m%9 ; t10
+    paddsw              m%2, m%9      ; t9
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
+%endmacro
+
+%macro WRAP_XMM 1+
+    INIT_XMM cpuname
+    %1
+    INIT_YMM cpuname
+%endmacro
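+; WRAP_XMM temporarily switches the register namespace to XMM so the wrapped
+; macro runs at 128-bit width inside an otherwise 256-bit function, then
+; restores YMM - used for passes that only need 4 columns of data.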
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+    vpbroadcastd         m2, [o(pw_%5)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+    lea                  r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+    %if %1 & 2
+        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
+    %else
+        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+    %endif
+    %assign %%i %%i + 1
+    %rotate 1
+%endrep
+    movd                 m2, [%%row_adr1]
+    pinsrd               m2, [%%row_adr2], 1
+    movd                 m3, [%%row_adr3]
+    pinsrd               m3, [%%row_adr4], 1
+    pmovzxbw             m2, m2
+    pmovzxbw             m3, m3
+    paddw                m0, m2
+    paddw                m1, m3
+    packuswb             m0, m1
+    movd       [%%row_adr1], m0
+    pextrd     [%%row_adr2], m0, 1
+    pextrd     [%%row_adr3], m0, 2
+    pextrd     [%%row_adr4], m0, 3
+    ret
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+    punpckhqdq           m3, m0, m1 ; in1 in3
+    punpcklqdq           m0, m1     ; in0 in2
+    psubw                m2, m0, m3
+    paddw                m0, m3
+    punpckhqdq           m2, m2     ; t2 t2
+    punpcklqdq           m0, m0     ; t0 t0
+    psubw                m1, m0, m2
+    psraw                m1, 1
+    psubw                m1, m3     ; t1 t3
+    psubw                m0, m1     ; ____ out0
+    paddw                m2, m1     ; out3 ____
+%endmacro
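+; IWHT4_1D_PACKED is the 4-point inverse Walsh-Hadamard transform (lossless
+; blocks): additions, subtractions and a single 1-bit shift only; the caller
+; pre-shifts the input coefficients right by 2.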
+
+INIT_XMM avx2
+cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+    psraw                m0, 2
+    psraw                m1, 2
+    IWHT4_1D_PACKED
+    punpckhwd            m0, m1
+    punpcklwd            m3, m1, m2
+    punpckhdq            m1, m0, m3
+    punpckldq            m0, m3
+    IWHT4_1D_PACKED
+    vpblendd             m0, m2, 0x03
+    ITX4_END              3, 0, 2, 1, 0
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
+    lea                 rax, [o_base]
+    ; If we're not taking the dct_dct fast path, jump to the 1st txfm
+    ; function, which in turn jumps indirectly (through tx2q) to the 2nd
+    ; txfm function.
+    lea                tx2q, [m(i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
+    test               eobd, eobd
+    jnz %%p1
+%else
+    ; jump to the 1st txfm function unless it's located directly after this
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
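+    ; (%%end - %%p1) is negative when %%p1 is placed after %%end, so bit 31
+    ; of the difference is set and exactly one jmp is emitted; when %%p1
+    ; starts right at %%end the difference is zero and the jmp is omitted,
+    ; falling through instead.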
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+    vpbroadcastw         m0, [cq]
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mov                [cq], eobd ; 0
+    pmulhrsw             m0, m1
+    mova                 m1, m0
+    jmp m(iadst_4x4_internal).end2
+%endif
+%endmacro
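+; In the dct_dct branch above, eob == 0 implies only the DC coefficient is
+; present, so the transform collapses to: broadcast cq[0], apply the
+; 2896/4096 (~1/sqrt(2)) scale once per pass, zero the coefficient, and add
+; the resulting uniform value to the whole block via the normal store path.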
+
+%macro IDCT4_1D_PACKED 0
+    vpbroadcastd         m4, [o(pd_2048)]
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
+    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
+    paddsw               m0, m1, m2 ; out0 out1
+    psubsw               m1, m2     ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+    punpcklwd            m2, m1, m0
+    punpckhwd            m3, m1, m0
+    vpbroadcastd         m5, [o(pw_m3344_3344)]
+    vpbroadcastd         m0, [o(pw_3803_1321)]
+    vpbroadcastd         m4, [o(pw_m1321_2482)]
+    pmaddwd              m1, m5, m2 ; 3344*in3 - 3344*in2
+    psrld                m5, 16
+    pmaddwd              m0, m2
+    pmaddwd              m2, m4
+    pmaddwd              m5, m3 ; 3344*in0
+    paddd                m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
+    vpbroadcastd         m4, [o(pw_2482_3344)]
+    vpbroadcastd         m5, [o(pw_m3803_3344)]
+    pmaddwd              m4, m3
+    pmaddwd              m5, m3
+    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
+    vpbroadcastd         m0, [o(pw_m3803_m6688)]
+    pmaddwd              m3, m0
+    vpbroadcastd         m0, [o(pd_2048)]
+    paddd                m2, m0
+    paddd                m1, m0
+    paddd                m0, m4
+    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
+    paddd                m2, m4
+    paddd                m2, m3
+    REPX      {psrad x, 12}, m1, m2, m0, m5
+    packssdw             m0, m5 ; out0 out1
+    packssdw             m1, m2 ; out2 out3
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    IDCT4_1D_PACKED
+    mova                 m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m1, q0220
+    pshufb               m0, m2
+    pshufb               m1, m3, m2
+    jmp                tx2q
+.pass2:
+    IDCT4_1D_PACKED
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+    ITX4_END              0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    call .main
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    call .main
+.end:
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+.end2:
+    ITX4_END              0, 1, 2, 3
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    call m(iadst_4x4_internal).main
+    punpcklwd            m2, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    jmp                tx2q
+.pass2:
+    call m(iadst_4x4_internal).main
+.end:
+    pxor                 m2, m2
+    mova          [cq+16*0], m2
+    mova          [cq+16*1], m2
+.end2:
+    ITX4_END              3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
+    mova                 m0, [cq+16*0]
+    mova                 m1, [cq+16*1]
+    vpbroadcastd         m3, [o(pw_1697x8)]
+    pmulhrsw             m2, m3, m0
+    pmulhrsw             m3, m1
+    paddsw               m0, m2
+    paddsw               m1, m3
+    punpckhwd            m2, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m2
+    punpcklwd            m0, m2
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m3, [o(pw_1697x8)]
+    pmulhrsw             m2, m3, m0
+    pmulhrsw             m3, m1
+    paddsw               m0, m2
+    paddsw               m1, m3
+    jmp m(iadst_4x4_internal).end
+
+%macro WRITE_4X8 2 ; coefs[1-2]
+    movd                xm4, [dstq+strideq*0]
+    pinsrd              xm4, [dstq+strideq*1], 1
+    movd                xm5, [dstq+strideq*2]
+    pinsrd              xm5, [dstq+r3       ], 1
+    pinsrd              xm4, [r2  +strideq*0], 2
+    pinsrd              xm4, [r2  +strideq*1], 3
+    pinsrd              xm5, [r2  +strideq*2], 2
+    pinsrd              xm5, [r2  +r3       ], 3
+    pmovzxbw             m4, xm4
+    pmovzxbw             m5, xm5
+    paddw                m4, m%1
+    paddw                m5, m%2
+    packuswb             m4, m5
+    vextracti128        xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+strideq*2], xm4, 2
+    pextrd [dstq+r3       ], xm4, 3
+    movd   [r2  +strideq*0], xm5
+    pextrd [r2  +strideq*1], xm5, 1
+    pextrd [r2  +strideq*2], xm5, 2
+    pextrd [r2  +r3       ], xm5, 3
+%endmacro
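+; WRITE_4X8 reads eight 4-byte rows (four via dstq, four via r2),
+; zero-extends them to words, adds the two residual registers, packs back
+; with unsigned saturation and writes the rows out to the same addresses.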
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+    jmp m(iadst_4x8_internal).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+    vpbroadcastd         m6, [o(pd_2048)]
+    punpckhwd            m5, m3, m0 ; in7 in1
+    punpckhwd            m4, m1, m2 ; in3 in5
+    punpcklwd            m3, m1     ; in6 in2
+    punpcklwd            m2, m0     ; in4 in0
+    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
+    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
+    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
+    paddsw               m4, m5     ; t4  t7  (interleaved)
+    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
+    vpbroadcastd         m1, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+    vbroadcasti128       m1, [o(deint_shuf)]
+    pshufb               m4, m1
+%else
+    pshufb               m4, [o(deint_shuf)]
+%endif
+    psubsw               m1, m2, m3 ; tmp3 tmp2
+    paddsw               m3, m2     ; tmp0 tmp1
+    shufps               m2, m4, m0, q1032 ; t7 t6
+    vpblendd             m4, m0, 0xcc      ; t4 t5
+    paddsw               m0, m3, m2 ; out0 out1
+    psubsw               m3, m2     ; out7 out6
+    psubsw               m2, m1, m4 ; out4 out5
+    paddsw               m1, m4     ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+    vpbroadcastd         m6, [o(pd_2048)]
+    punpckhwd            m0, m4, m3 ; 0 7
+    punpckhwd            m1, m5, m2 ; 2 5
+    punpcklwd            m2, m5     ; 4 3
+    punpcklwd            m3, m4     ; 6 1
+%if %1 == 1
+    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
+    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+    psubsw               m4, m0, m2 ; t5 t4
+    paddsw               m0, m2     ; t1 t0
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
+    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+    vbroadcasti128       m2, [o(deint_shuf)]
+%else
+    mova                 m2, [o(deint_shuf)]
+%endif
+    pshuflw              m1, m1, q2301
+    pshufhw              m1, m1, q2301
+    psubsw               m3, m0, m1        ; t3 t2
+    paddsw               m0, m1            ; -out7  out0
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ;  out6 -out1
+    pshufb               m0, m2
+    pshufb               m4, m2
+    vpbroadcastd         m5, [o(pw_m2896_2896)]
+    pmaddwd              m2, m5, m3
+    pmaddwd              m5, m1
+    paddd                m2, m6
+    paddd                m5, m6
+    psrad                m2, 12
+    psrad                m5, 12
+    packssdw             m2, m5            ; out4 -out5
+    vpbroadcastd         m5, [o(pw_2896_2896)]
+    pmaddwd              m3, m5
+    pmaddwd              m1, m5
+    paddd                m3, m6
+    paddd                m1, m6
+    psrad                m3, 12
+    psrad                m1, 12
+    packssdw             m1, m3            ; out2 -out3
+    punpcklqdq           m3, m4, m0        ; out6 -out7
+    punpckhqdq           m0, m4            ; out0 -out1
+%else
+    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
+    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
+    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
+    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
+    psubsw               m4, m0, m2 ; t4 t5
+    paddsw               m0, m2     ; t0 t1
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
+    shufps               m2, m5, m4, q1032
+    punpckhwd            m4, m2
+    punpcklwd            m5, m2
+    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
+    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
+    psubsw               m2, m0, m1        ; t2 t3
+    paddsw               m0, m1            ; out0 -out7
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ; out6 -out1
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
+    vpblendd             m0, m4, 0xcc      ; out0 -out1
+    shufps               m4, m2, m1, q1032 ; t3 t7
+    vpblendd             m1, m2, 0x33      ; t2 t6
+    psubsw               m2, m1, m4        ; t2-t3 t6-t7
+    paddsw               m1, m4            ; t2+t3 t6+t7
+    pmulhrsw             m2, m5            ; out4 -out5
+    pshufd               m1, m1, q1032
+    pmulhrsw             m1, m5            ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    IDCT4_1D_PACKED
+    vbroadcasti128       m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m1, q0220
+    pshufb               m0, m2
+    pshufb               m1, m3, m2
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    call .main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
+    pshufd               m1, m1, q1032
+    jmp m(iadst_4x8_internal).end2
+ALIGN function_align
+.main:
+    WRAP_XMM IDCT8_1D_PACKED
+    ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    call m(iadst_8x4_internal).main
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    pshufd              xm4, xm0, q1032
+    pshufd              xm5, xm1, q1032
+    call .main_pass2
+    vpbroadcastd         m4, [o(pw_2048)]
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
+    pxor                 m5, m5
+    psubw                m5, m4
+.end:
+    vpblendd             m4, m5, 0xcc
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    WIN64_RESTORE_XMM
+    pxor                 m2, m2
+    mova          [cq+32*0], m2
+    mova          [cq+32*1], m2
+.end3:
+    lea                  r2, [dstq+strideq*4]
+    lea                  r3, [strideq*3]
+    WRITE_4X8             0, 1
+    RET
+ALIGN function_align
+.main_pass1:
+    WRAP_XMM IADST8_1D_PACKED 1
+    ret
+ALIGN function_align
+.main_pass2:
+    WRAP_XMM IADST8_1D_PACKED 2
+    ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120
+    vpermq               m1, [cq+32*1], q3120
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    call m(iadst_8x4_internal).main
+    punpcklwd            m3, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m1, m3
+    punpckhwd            m1, m3
+    jmp                tx2q
+.pass2:
+    vextracti128        xm2, m0, 1
+    vextracti128        xm3, m1, 1
+    pshufd              xm4, xm0, q1032
+    pshufd              xm5, xm1, q1032
+    call m(iadst_4x8_internal).main_pass2
+    vpbroadcastd         m5, [o(pw_2048)]
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
+    pxor                 m4, m4
+    psubw                m4, m5
+    pshufd               m0, m3, q1032
+    pshufd               m1, m2, q1032
+    jmp m(iadst_4x8_internal).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m2, [cq+32*0], q3120
+    vpermq               m0, [cq+32*1], q3120
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    vpbroadcastd         m4, [o(pw_1697x8)]
+    punpcklwd            m1, m2, m0
+    punpckhwd            m2, m0
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    pmulhrsw             m2, m4, m0
+    pmulhrsw             m4, m1
+    paddsw               m0, m2
+    paddsw               m1, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_4096)]
+    jmp m(iadst_4x8_internal).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    movd                xm3, [o(pw_2048)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm3
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp m(iadst_4x16_internal).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+    vpbroadcastd        m10, [o(pd_2048)]
+.main2:
+    punpckhwd            m8, m7, m0 ; dct16 in15 in1
+    punpcklwd            m9, m4, m0 ; dct4  in2  in0
+    punpckhwd            m0, m3, m4 ; dct16 in7  in9
+    punpcklwd            m7, m1     ; dct8  in7  in1
+    punpckhwd            m1, m6     ; dct16 in3  in13
+    punpcklwd            m3, m5     ; dct8  in3  in5
+    punpckhwd            m5, m2     ; dct16 in11 in5
+    punpcklwd            m6, m2     ; dct4  in3  in1
+    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
+    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
+    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
+    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
+    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
+    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
+    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
+    psubsw               m2, m8, m0 ; t9  t14
+    paddsw               m8, m0     ; t8  t15
+    psubsw               m0, m1, m5 ; t10 t13
+    paddsw               m1, m5     ; t11 t12
+    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
+    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
+    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
+    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
+    psubsw               m4, m8, m1 ; t11a t12a
+    paddsw               m8, m1     ; t8a  t15a
+    psubsw               m1, m7, m3 ; t5a  t6a
+    paddsw               m7, m3     ; t4   t7
+    paddsw               m3, m2, m0 ; t9   t14
+    psubsw               m2, m0     ; t10  t13
+%if mmsize > 16
+    vbroadcasti128       m0, [o(deint_shuf)]
+%else
+    mova                 m0, [o(deint_shuf)]
+%endif
+    pshufb               m8, m0
+    pshufb               m7, m0
+    pshufb               m3, m0
+    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
+    vpbroadcastd         m0, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
+    vpbroadcastd         m5, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
+    vpbroadcastd         m0, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
+    punpckhqdq           m0, m8, m3        ; t15a t14
+    punpcklqdq           m8, m3            ; t8a  t9
+    shufps               m5, m4, m2, q1032 ; t12  t13a
+    vpblendd             m4, m2, 0xcc      ; t11  t10a
+    shufps               m2, m7, m1, q1032 ; t7 t6
+    vpblendd             m7, m1, 0xcc      ; t4 t5
+    psubsw               m1, m9, m6 ; dct4 out3 out2
+    paddsw               m9, m6     ; dct4 out0 out1
+    psubsw               m3, m9, m2 ; dct8 out7 out6
+    paddsw               m9, m2     ; dct8 out0 out1
+    psubsw               m2, m1, m7 ; dct8 out4 out5
+    paddsw               m1, m7     ; dct8 out3 out2
+    psubsw               m7, m9, m0 ; out15 out14
+    paddsw               m0, m9     ; out0  out1
+    psubsw               m6, m1, m5 ; out12 out13
+    paddsw               m1, m5     ; out3  out2
+    psubsw               m5, m2, m4 ; out11 out10
+    paddsw               m2, m4     ; out4  out5
+    psubsw               m4, m3, m8 ; out8  out9
+    paddsw               m3, m8     ; out7  out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(idct_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    vextracti128        xm4, m0, 1
+    vextracti128        xm5, m1, 1
+    vextracti128        xm6, m2, 1
+    vextracti128        xm7, m3, 1
+    call .main
+    vinserti128          m0, xm4, 1
+    vinserti128          m1, xm5, 1
+    vpbroadcastd         m5, [o(pw_2048)]
+    vinserti128          m2, xm6, 1
+    vinserti128          m3, xm7, 1
+    pshufd               m1, m1, q1032
+    pshufd               m3, m3, q1032
+    jmp m(iadst_4x16_internal).end2
+ALIGN function_align
+.main:
+    WRAP_XMM IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    call .main
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    paddsw               m1, m2, m4
+    psubsw               m2, m4
+    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
+    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
+    vpbroadcastd         m5, [o(pw_2048)]
+    pshufd               m1, m1, q1032
+    vpblendd             m4, m1, m0, 0x33
+    vpblendd             m0, m2, 0x33
+    vpblendd             m2, m3, 0x33
+    vpblendd             m3, m1, 0x33
+    vpermq               m0, m0, q2031
+    vpermq               m1, m2, q1302
+    vpermq               m2, m3, q3120
+    vpermq               m3, m4, q0213
+    psubw                m6, m7, m5
+.end:
+    vpblendd             m5, m6, 0xcc
+.end2:
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+.end3:
+    lea                  r2, [dstq+strideq*8]
+    lea                  r3, [strideq*3]
+    WRITE_4X8             0, 1
+    lea                dstq, [dstq+strideq*4]
+    lea                  r2, [r2  +strideq*4]
+    WRITE_4X8             2, 3
+    RET
+ALIGN function_align
+.main:
+    vpblendd             m4, m1, m0, 0xcc
+    vpblendd             m1, m0, 0x33
+    vpblendd             m5, m2, m3, 0xcc
+    vpblendd             m2, m3, 0x33
+    vperm2i128           m3, m5, m2, 0x31
+    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
+    vperm2i128           m4, m1, m4, 0x31
+    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
+    pshufd               m3, m3, q1032  ; in12 in15 in13 in14
+    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
+.main2:
+    vpbroadcastd         m8, [o(pd_2048)]
+    pxor                 m7, m7
+    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
+    punpcklwd            m0, m3     ; in0  in15 in2  in13
+    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
+    punpcklwd            m1, m2     ; in4  in11 in6  in9
+    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
+    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
+    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
+    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
+    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
+    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
+    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
+    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
+    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
+    psubw                m6, m7, m5
+    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
+    vpbroadcastd         m6, [o(pw_m3784_1567)]
+    vpbroadcastd         m5, [o(pw_1567_3784)]
+    psubsw               m4, m0, m1 ; t5   t4   t7   t6
+    paddsw               m0, m1     ; t1   t0   t3   t2
+    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
+    paddsw               m2, m3     ; t9a  t8a  t11a t10a
+    psubw                m3, m7, m6 ; pw_3784_m1567
+    vpblendd             m6, m3, 0xf0
+    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
+    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
+    vbroadcasti128       m5, [o(deint_shuf)]
+    pshufb               m0, m5
+    pshufb               m2, m5
+    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
+    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
+    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
+    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
+    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
+    psubsw               m1, m0, m3        ; t3a t2a t11 t10
+    paddsw               m0, m3     ; -out15  out0   out14 -out1
+    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
+    psubsw               m4, m2            ; t6 t7 t14a t15a
+    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
+    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd         m5, [o(pw_m2896_2896)]
+    vpbroadcastd         m6, [o(pw_2896_2896)]
+    punpcklwd            m1, m4, m2
+    punpckhwd            m4, m2
+    pmaddwd              m2, m5, m4
+    pmaddwd              m4, m6
+    pmaddwd              m5, m1
+    pmaddwd              m1, m6
+    REPX      {paddd x, m8}, m5, m1, m2, m4
+    REPX      {psrad x, 12}, m5, m2, m1, m4
+    packssdw             m2, m5     ; -out11  out8   out10 -out9
+    packssdw             m1, m4     ; -out7   out4   out6  -out5
+    ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpcklwd            m4, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m3, m2
+    punpckhwd            m3, m2
+    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3
+    punpckldq            m2, m3, m1
+    punpckhdq            m3, m1
+    punpckhdq            m1, m0, m4
+    punpckldq            m0, m4
+    jmp                tx2q
+.pass2:
+    call m(iadst_4x16_internal).main
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    paddsw               m1, m2, m4
+    psubsw               m2, m4
+    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
+    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
+    vpbroadcastd         m6, [o(pw_2048)]
+    pshufd               m1, m1, q1032
+    vpblendd             m4, m0, m2, 0x33
+    vpblendd             m0, m1, 0xcc
+    vpblendd             m1, m3, 0xcc
+    vpblendd             m2, m3, 0x33
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q0213
+    vpermq               m2, m2, q2031
+    vpermq               m3, m4, q1302
+    psubw                m5, m7, m6
+    jmp m(iadst_4x16_internal).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                 m3, [cq+32*0]
+    mova                 m2, [cq+32*1]
+    mova                 m4, [cq+32*2]
+    mova                 m5, [cq+32*3]
+    vpbroadcastd         m8, [o(pw_1697x8)]
+    pcmpeqw              m0, m0 ; -1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m5
+    punpckhwd            m4, m5
+    pmulhrsw             m5, m8, m1
+    pmulhrsw             m6, m8, m2
+    pmulhrsw             m7, m8, m3
+    pmulhrsw             m8, m4
+    pcmpeqw              m9, m0, m1 ; we want to do a signed avg, but pavgw is
+    pxor                 m1, m9     ; unsigned. as long as both signs are equal
+    pcmpeqw              m9, m0, m2 ; it still works, but if the input is -1 the
+    pxor                 m2, m9     ; pmulhrsw result will become 0 which causes
+    pcmpeqw              m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
+    pxor                 m3, m9     ; we explicitly deal with that case here.
+    pcmpeqw              m0, m4
+    pxor                 m4, m0
+    pavgw                m1, m5
+    pavgw                m2, m6
+    pavgw                m3, m7
+    pavgw                m4, m8
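+    ; worked example: in = -1 gives a pmulhrsw result of 0; flipping in to 0
+    ; through the mask makes pavgw(0, 0) = 0, matching the signed average
+    ; ((-1 + 0 + 1) >> 1), whereas pavgw(0xffff, 0) would yield 0x8000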
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m8, [o(pw_1697x16)]
+    vpbroadcastd         m5, [o(pw_2048)]
+    pmulhrsw             m4, m8, m0
+    pmulhrsw             m6, m8, m1
+    pmulhrsw             m7, m8, m2
+    pmulhrsw             m8, m3
+    REPX      {paddsw x, x}, m0, m1, m2, m3
+    paddsw               m0, m4
+    paddsw               m1, m6
+    paddsw               m2, m7
+    paddsw               m3, m8
+    jmp m(iadst_4x16_internal).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
+    movq               xm%3, [dstq   ]
+    movhps             xm%3, [dstq+%5]
+    movq               xm%4, [dstq+%6]
+    movhps             xm%4, [dstq+%7]
+    pmovzxbw            m%3, xm%3
+    pmovzxbw            m%4, xm%4
+%ifnum %1
+    paddw               m%3, m%1
+%else
+    paddw               m%3, %1
+%endif
+%ifnum %2
+    paddw               m%4, m%2
+%else
+    paddw               m%4, %2
+%endif
+    packuswb            m%3, m%4
+    vextracti128       xm%4, m%3, 1
+    movq          [dstq   ], xm%3
+    movhps        [dstq+%6], xm%3
+    movq          [dstq+%5], xm%4
+    movhps        [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    pmulhrsw            xm0, xm1
+    movd                xm2, [o(pw_2048)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mova                 m1, m0
+    jmp m(iadst_8x4_internal).end3
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd        xm3, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm3, [cq+16*0]
+    pmulhrsw            xm1, xm3, [cq+16*1]
+    pmulhrsw            xm2, xm3, [cq+16*2]
+    pmulhrsw            xm3,      [cq+16*3]
+    call m(idct_4x8_internal).main
+    vbroadcasti128       m4, [o(deint_shuf)]
+    vinserti128          m3, m1, xm3, 1
+    vinserti128          m1, m0, xm2, 1
+    shufps               m0, m1, m3, q0220
+    shufps               m1, m3, q1331
+    pshufb               m0, m4
+    pshufb               m1, m4
+    jmp                tx2q
+.pass2:
+    IDCT4_1D_PACKED
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pshufd              xm4,      [cq+16*0], q1032
+    pmulhrsw            xm3, xm0, [cq+16*3]
+    pshufd              xm5,      [cq+16*1], q1032
+    pmulhrsw            xm2, xm0, [cq+16*2]
+    pmulhrsw            xm4, xm0
+    pmulhrsw            xm5, xm0
+    call m(iadst_4x8_internal).main_pass1
+    vinserti128        m0, xm2, 1
+    vinserti128        m1, xm3, 1
+    punpckhwd          m2, m0, m1
+    punpcklwd          m0, m1
+    pxor               m3, m3
+    psubsw             m3, m2
+    punpckhwd          m1, m0, m3
+    punpcklwd          m0, m3
+    jmp              tx2q
+.pass2:
+    call .main
+.end:
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q3120
+.end2:
+    vpbroadcastd         m2, [o(pw_2048)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    WIN64_RESTORE_XMM
+.end3:
+    pxor                 m2, m2
+    mova          [cq+32*0], m2
+    mova          [cq+32*1], m2
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 4, 5
+    RET
+ALIGN function_align
+.main:
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpbroadcastd        xm0, [o(pw_2896x8)]
+    pshufd              xm4,      [cq+16*0], q1032
+    pmulhrsw            xm3, xm0, [cq+16*3]
+    pshufd              xm5,      [cq+16*1], q1032
+    pmulhrsw            xm2, xm0, [cq+16*2]
+    pmulhrsw            xm4, xm0
+    pmulhrsw            xm5, xm0
+    call m(iadst_4x8_internal).main_pass1
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
+    punpckhwd            m1, m3, m2
+    punpcklwd            m3, m2
+    pxor                 m0, m0
+    psubsw               m0, m1
+    punpckhwd            m1, m0, m3
+    punpcklwd            m0, m3
+    jmp                tx2q
+.pass2:
+    call m(iadst_8x4_internal).main
+    mova                 m2, m1
+    vpermq               m1, m0, q2031
+    vpermq               m0, m2, q2031
+    jmp m(iadst_8x4_internal).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    mova                xm2, [cq+16*0]
+    mova                xm0, [cq+16*1]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m0, [cq+16*3], 1
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    punpcklwd            m1, m2, m0
+    punpckhwd            m2, m0
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    paddsw               m0, m0
+    paddsw               m1, m1
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m3, [o(pw_1697x8)]
+    pmulhrsw             m2, m3, m0
+    pmulhrsw             m3, m1
+    paddsw               m0, m2
+    paddsw               m1, m3
+    jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 3 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+.end:
+    mov                 r2d, 2
+.end2:
+    lea                  r3, [strideq*3]
+.loop:
+    WRITE_8X4             0, 0, 1, 2
+    lea                dstq, [dstq+strideq*4]
+    dec                 r2d
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q3120 ; 0 1
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    vpermq               m1, [cq+32*1], q3120 ; 2 3
+    call .main
+    shufps               m4, m0, m1, q0220
+    shufps               m5, m0, m1, q1331
+    shufps               m1, m2, m3, q0220
+    shufps               m3, m2, m3, q1331
+    vbroadcasti128       m0, [o(deint_shuf)]
+    vpbroadcastd         m2, [o(pw_16384)]
+    REPX   {pshufb   x, m0}, m4, m5, m1, m3
+    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
+    vinserti128          m0, m4, xm1, 1
+    vperm2i128           m2, m4, m1, 0x31
+    vinserti128          m1, m5, xm3, 1
+    vperm2i128           m3, m5, m3, 0x31
+    jmp                tx2q
+.pass2:
+    call .main
+    vpbroadcastd         m4, [o(pw_2048)]
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    jmp m(iadst_8x8_internal).end2
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m4, [cq+32*0], q1302 ; 1 0
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m5, [cq+32*1], q1302 ; 3 2
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    call .main_pass1
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpcklwd            m4, m0, m1
+    punpckhwd            m0, m1
+    punpcklwd            m1, m2, m3
+    punpckhwd            m2, m3
+    pxor                 m3, m3
+    psubw                m3, m5 ; negate odd elements during rounding
+    pmulhrsw             m4, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m3
+    punpcklwd            m3, m4, m0
+    punpckhwd            m4, m0
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
+    vperm2i128           m2, m3, m0, 0x31
+    vinserti128          m0, m3, xm0, 1
+    vperm2i128           m3, m4, m1, 0x31
+    vinserti128          m1, m4, xm1, 1
+    jmp                tx2q
+.pass2:
+    pshufd               m4, m0, q1032
+    pshufd               m5, m1, q1032
+    call .main_pass2
+    vpbroadcastd         m5, [o(pw_2048)]
+    vpbroadcastd        xm4, [o(pw_4096)]
+    psubw                m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+.end3:
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+    WIN64_RESTORE_XMM
+.end4:
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 4, 5
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             2, 3, 4, 5
+    RET
+ALIGN function_align
+.main_pass1:
+    IADST8_1D_PACKED 1
+    ret
+ALIGN function_align
+.main_pass2:
+    IADST8_1D_PACKED 2
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    vpermq               m4, [cq+32*0], q1302 ; 1 0
+    vpermq               m3, [cq+32*3], q3120 ; 6 7
+    vpermq               m5, [cq+32*1], q1302 ; 3 2
+    vpermq               m2, [cq+32*2], q3120 ; 4 5
+    call m(iadst_8x8_internal).main_pass1
+    vpbroadcastd         m5, [o(pw_16384)]
+    punpckhwd            m4, m3, m2
+    punpcklwd            m3, m2
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+    pxor                 m0, m0
+    psubw                m0, m5
+    pmulhrsw             m4, m0
+    pmulhrsw             m3, m5
+    pmulhrsw             m2, m0
+    pmulhrsw             m1, m5
+    punpckhwd            m0, m4, m3
+    punpcklwd            m4, m3
+    punpckhwd            m3, m2, m1
+    punpcklwd            m2, m1
+    vinserti128          m1, m0, xm3, 1
+    vperm2i128           m3, m0, m3, 0x31
+    vinserti128          m0, m4, xm2, 1
+    vperm2i128           m2, m4, m2, 0x31
+    jmp                tx2q
+.pass2:
+    pshufd               m4, m0, q1032
+    pshufd               m5, m1, q1032
+    call m(iadst_8x8_internal).main_pass2
+    vpbroadcastd         m4, [o(pw_2048)]
+    vpbroadcastd        xm5, [o(pw_4096)]
+    psubw                m4, m5 ; lower half = -2048, upper half = 2048
+    vpermq               m5, m3, q2031
+    vpermq               m3, m0, q2031
+    vpermq               m0, m2, q2031
+    vpermq               m2, m1, q2031
+    pmulhrsw             m1, m0, m4
+    pmulhrsw             m0, m5, m4
+    jmp m(iadst_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*1]
+    vinserti128          m3, [cq+16*4], 1
+    vinserti128          m2, [cq+16*5], 1
+    mova                xm4, [cq+16*2]
+    mova                xm0, [cq+16*3]
+    vinserti128          m4, [cq+16*6], 1
+    vinserti128          m0, [cq+16*7], 1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m4, [o(pw_4096)]
+    jmp m(iadst_8x8_internal).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 3 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mov                 r2d, 4
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
+    vpbroadcastd         m4, [o(pw_2896x8)]
+    pmulhrsw             m0, m4, [cq+32*0]
+    add                  cq, 32*4
+    pmulhrsw             m7, m4, [cq+32*3]
+    pmulhrsw             m1, m4, [cq-32*3]
+    pmulhrsw             m6, m4, [cq+32*2]
+    pmulhrsw             m2, m4, [cq-32*2]
+    pmulhrsw             m5, m4, [cq+32*1]
+    pmulhrsw             m3, m4, [cq-32*1]
+    pmulhrsw             m4,     [cq+32*0]
+%endmacro
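+; Loads all eight 32-byte coefficient rows pre-scaled by 2896/4096 (the
+; rectangular-transform scale); cq is advanced by 32*4 up front so that
+; every row remains reachable with a 1-byte displacement.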
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(idct_16x8_internal).main
+    vpbroadcastd        m10, [o(pw_16384)]
+.pass1_end:
+    vperm2i128           m9, m3, m7, 0x31
+    vinserti128          m3, xm7, 1
+    vperm2i128           m8, m2, m6, 0x31
+    vinserti128          m2, xm6, 1
+    vperm2i128           m6, m1, m5, 0x31
+    vinserti128          m1, xm5, 1
+    vperm2i128           m5, m0, m4, 0x31
+    vinserti128          m0, xm4, 1
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+.pass1_end2:
+    punpckhwd            m7, m5, m6
+    punpcklwd            m5, m6
+    punpcklwd            m6, m8, m9
+    punpckhwd            m8, m9
+    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    punpckldq            m4, m5, m6
+    punpckhdq            m5, m6
+    punpckldq            m6, m7, m8
+    punpckhdq            m7, m8
+    jmp                tx2q
+.pass2:
+    call .main
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    vpbroadcastd         m8, [o(pw_2048)]
+.end2:
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0, 1, 8, 9
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             2, 3, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             4, 5, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             6, 7, 0, 1
+    RET
+ALIGN function_align
+.main:
+    IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+    vpbroadcastd        m10, [o(pw_16384)]
+    pslld                m9, m10, 17
+    psubw               m10, m9 ; 16384, -16384
+    jmp m(idct_8x16_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    vpbroadcastd         m9, [o(pw_2048)]
+    vpbroadcastd        xm8, [o(pw_4096)]
+    psubw                m8, m9
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    jmp m(idct_8x16_internal).end2
+ALIGN function_align
+.main:
+    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+    vpbroadcastd        m10, [o(pd_2048)]
+    punpckhwd            m8, m7, m0 ; in14 in1
+    punpcklwd            m0, m7     ; in0  in15
+    punpcklwd            m7, m6, m1 ; in12 in3
+    punpckhwd            m1, m6     ; in2  in13
+    punpckhwd            m6, m5, m2 ; in10 in5
+    punpcklwd            m2, m5     ; in4  in11
+    punpcklwd            m5, m4, m3 ; in8  in7
+    punpckhwd            m3, m4     ; in6  in9
+    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
+    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
+    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
+    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
+    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
+    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
+    psubsw               m4, m0, m5 ; t9a  t8a
+    paddsw               m0, m5     ; t1a  t0a
+    psubsw               m5, m1, m6 ; t11a t10a
+    paddsw               m1, m6     ; t3a  t2a
+    psubsw               m6, m2, m7 ; t13a t12a
+    paddsw               m2, m7     ; t5a  t4a
+    psubsw               m7, m3, m8 ; t15a t14a
+    paddsw               m3, m8     ; t7a  t6a
+    vpbroadcastd        m11, [o(pw_m4017_799)]
+    vpbroadcastd        m12, [o(pw_799_4017)]
+    pxor                 m9, m9
+    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
+    psubw                m8, m9, m11 ; pw_4017_m799
+    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
+    vpbroadcastd        m11, [o(pw_m2276_3406)]
+    vpbroadcastd        m12, [o(pw_3406_2276)]
+    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
+    psubw                m8, m9, m11 ; pw_2276_m3406
+    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
+    psubsw               m8, m1, m3 ; t7   t6
+    paddsw               m1, m3     ; t3   t2
+    psubsw               m3, m0, m2 ; t5   t4
+    paddsw               m0, m2     ; t1   t0
+    psubsw               m2, m5, m7 ; t14a t15a
+    paddsw               m7, m5     ; t10a t11a
+    psubsw               m5, m4, m6 ; t12a t13a
+    paddsw               m4, m6     ; t8a  t9a
+    vpbroadcastd        m11, [o(pw_m3784_1567)]
+    vpbroadcastd        m12, [o(pw_1567_3784)]
+    ITX_MUL2X_PACK        3, 6, _, 10, 12, 11, 6 ; t5a t4a
+    psubw                m6, m9, m11 ; pw_3784_m1567
+    ITX_MUL2X_PACK        8, 6, _, 10, 6, 12, 6  ; t7a t6a
+    vpbroadcastd        m11, [o(pw_m1567_3784)]
+    vpbroadcastd        m12, [o(pw_3784_1567)]
+    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 6 ; t15 t14
+    psubw                m6, m9, m11 ; pw_1567_m3784
+    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 6 ; t13 t12
+    vbroadcasti128      m12, [o(deint_shuf)]
+    paddsw               m6, m4, m7        ; -out1  out14
+    psubsw               m4, m7            ;  t10    t11
+    psubsw              m11, m3, m8        ;  t7     t6
+    paddsw               m8, m3            ;  out12 -out3
+    psubsw               m3, m0, m1        ;  t3a    t2a
+    paddsw               m0, m1            ; -out15  out0
+    paddsw               m1, m2, m5        ; -out13  out2
+    psubsw               m5, m2            ;  t15a   t14a
+    pshufb               m0, m12
+    pshufb               m6, m12
+    pshufb               m8, m12
+    pshufb               m1, m12
+    shufps               m7, m6, m0, q1032 ;  out14 -out15
+    vpblendd             m0, m6, 0x33      ; -out1   out0
+    punpcklqdq           m6, m8, m1        ;  out12 -out13
+    punpckhqdq           m1, m8, m1        ; -out3   out2
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd         m8, [o(pw_m2896_2896)]
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    pmaddwd              m9, m8, m11       ; -out11
+    pmaddwd              m2, m12, m5       ; -out5
+    pmaddwd              m5, m8            ;  out10
+    pmaddwd             m11, m12           ;  out4
+    REPX     {paddd x, m10}, m9, m5, m2, m11
+    REPX     {psrad x, 12 }, m9, m5, m2, m11
+    packssdw             m5, m9            ;  out10 -out11
+    packssdw             m2, m11           ; -out5   out4
+    pmaddwd             m11, m8, m3        ;  out8
+    vpbroadcastd         m8, [o(pw_2896_m2896)]
+    pmaddwd              m3, m12           ; -out7
+    pmaddwd              m8, m4            ; -out9
+    pmaddwd              m4, m12           ;  out6
+    REPX     {paddd x, m10}, m11, m3, m8, m4
+    REPX     {psrad x, 12 }, m11, m3, m8, m4
+    packssdw             m3, m4            ; -out7   out6
+    packssdw             m4, m11, m8       ;  out8  -out9
+    vpbroadcastd        m10, [o(pw_16384)]
+    pxor                 m9, m9
+    ret
+ALIGN function_align
+.main_pass2_end:
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    pshufb               m2, m11, m12
+    pshufb               m5, m12
+    pshufb               m3, m12
+    pshufb               m4, m12
+    punpcklqdq          m11, m5, m2        ;  t15a   t7
+    punpckhqdq           m5, m2            ;  t14a   t6
+    shufps               m2, m3, m4, q1032 ;  t2a    t10
+    vpblendd             m3, m4, 0xcc      ;  t3a    t11
+    psubsw               m4, m2, m3        ;  out8  -out9
+    paddsw               m3, m2            ; -out7   out6
+    paddsw               m2, m5, m11       ; -out5   out4
+    psubsw               m5, m11           ;  out10 -out11
+    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+    vpbroadcastd         m9, [o(pw_16384)]
+    pslld               m10, m9, 17
+    psubw               m10, m9 ; -16384, 16384
+    vperm2i128           m9, m4, m0, 0x31
+    vinserti128          m0, m4, xm0, 1
+    vperm2i128           m8, m5, m1, 0x31
+    vinserti128          m4, m5, xm1, 1
+    vperm2i128           m5, m7, m3, 0x31
+    vinserti128          m3, m7, xm3, 1
+    vinserti128          m1, m6, xm2, 1
+    vperm2i128           m6, m6, m2, 0x31
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpcklwd            m0, m3, m1
+    punpckhwd            m3, m1
+    jmp m(idct_8x16_internal).pass1_end2
+.pass2:
+    call m(iadst_8x16_internal).main
+    call m(iadst_8x16_internal).main_pass2_end
+    vpbroadcastd         m8, [o(pw_2048)]
+    vpbroadcastd        xm9, [o(pw_4096)]
+    psubw                m8, m9
+    vpermq               m9, m0, q3120
+    vpermq               m0, m7, q2031
+    vpermq               m7, m1, q3120
+    vpermq               m1, m6, q2031
+    vpermq               m6, m2, q3120
+    vpermq               m2, m5, q2031
+    vpermq               m5, m3, q3120
+    vpermq               m3, m4, q2031
+    pmulhrsw             m0, m8
+    pmulhrsw             m1, m8
+    pmulhrsw             m2, m8
+    pmulhrsw             m3, m8
+    pmulhrsw             m4, m5, m8
+    pmulhrsw             m5, m6, m8
+    pmulhrsw             m6, m7, m8
+    pmulhrsw             m7, m9, m8
+    jmp m(idct_8x16_internal).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
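+; The 16-point identity transform scales by 2*sqrt(2). With pw_1697x16,
+; pmulhrsw computes x*1697/2048 = (2*sqrt(2)-2)*x, so doubling x and adding
+; gives the full scale; the 4-argument form instead folds an extra >>1
+; (pw_16384) into the product, yielding sqrt(2)*x.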
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw            m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+    pmulhrsw            m%2, m%4
+%else
+    paddsw              m%1, m%1
+%endif
+    paddsw              m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*2]
+    add                  cq, 16*8
+    vinserti128          m3, [cq+16*0], 1
+    vinserti128          m2, [cq+16*2], 1
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*2]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*6], 1
+    mova                xm7, [cq-16*7]
+    mova                xm6, [cq-16*5]
+    vinserti128          m7, [cq+16*1], 1
+    vinserti128          m6, [cq+16*3], 1
+    mova                xm8, [cq-16*3]
+    mova                xm0, [cq-16*1]
+    vinserti128          m8, [cq+16*5], 1
+    vinserti128          m0, [cq+16*7], 1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m5
+    punpckhwd            m4, m5
+    punpcklwd            m5, m7, m6
+    punpckhwd            m7, m6
+    punpcklwd            m6, m8, m0
+    punpckhwd            m8, m0
+    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    punpckldq            m4, m5, m6
+    punpckhdq            m5, m6
+    punpckldq            m6, m7, m8
+    punpckhdq            m7, m8
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m8, [o(pw_1697x16)]
+    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+    jmp m(idct_8x16_internal).end
+
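+; Add two rows of 16 residuals to dst: pmovzxbw widens the pixels to words,
+; packuswb clips the sums back to [0,255] with unsigned saturation, and the
+; vpermq undoes the per-lane interleaving left by the pack.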
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+    pmovzxbw            m%3, [dstq+%5]
+%ifnum %1
+    paddw               m%3, m%1
+%else
+    paddw               m%3, %1
+%endif
+    pmovzxbw            m%4, [dstq+%6]
+%ifnum %2
+    paddw               m%4, m%2
+%else
+    paddw               m%4, %2
+%endif
+    packuswb            m%3, m%4
+    vpermq              m%3, m%3, q3120
+    mova          [dstq+%5], xm%3
+    vextracti128  [dstq+%6], m%3, 1
+%endmacro
+
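+; dct_dct special case: with only the DC coefficient present the output is a
+; constant offset, computed once with all scales and roundings applied, then
+; broadcast and added two rows per iteration. Storing eobd into [cq] clears
+; the DC coefficient (eob is 0 on this path; compare the explicit eob test
+; in the 8x32 function further down).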
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    mov                 r2d, 2
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [pw_2048] ; intentionally rip-relative
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m3, m3
+.dconly_loop:
+    mova                xm1, [dstq]
+    vinserti128          m1, [dstq+strideq], 1
+    punpckhbw            m2, m1, m3
+    punpcklbw            m1, m3
+    paddw                m2, m0
+    paddw                m1, m0
+    packuswb             m1, m2
+    mova             [dstq], xm1
+    vextracti128 [dstq+strideq], m1, 1
+    lea                dstq, [dstq+strideq*2]
+    dec                 r2d
+    jg .dconly_loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                xm0, [cq+16*0]
+    mova                xm1, [cq+16*1]
+    mova                xm2, [cq+16*2]
+    mova                xm3, [cq+16*3]
+    mova                xm4, [cq+16*4]
+    mova                xm5, [cq+16*5]
+    mova                xm6, [cq+16*6]
+    mova                xm7, [cq+16*7]
+    call m(idct_4x16_internal).main
+    vinserti128          m6, m2, xm6, 1
+    vinserti128          m2, m0, xm4, 1
+    vinserti128          m0, m1, xm5, 1
+    vinserti128          m1, m3, xm7, 1
+    punpcklwd            m3, m2, m6
+    punpckhwd            m2, m6
+    vpbroadcastd         m6, [o(pw_16384)]
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    mova                 m1, m6
+    jmp m(iadst_16x4_internal).pass1_end
+.pass2:
+    call .main
+    jmp m(iadst_16x4_internal).end
+ALIGN function_align
+.main:
+    vpbroadcastd         m6, [o(pd_2048)]
+    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q1230
+    vpermq               m3, [cq+32*3], q2103
+    vpermq               m1, [cq+32*1], q1230
+    vpermq               m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal).main2
+    call m(iadst_4x16_internal).main_pass1_end
+    punpcklwd            m4, m3, m1
+    punpcklwd            m5, m2, m0
+    punpckhwd            m0, m1
+    punpckhwd            m2, m3
+    vpbroadcastd         m1, [o(pw_16384)]
+    vinserti128          m3, m0, xm2, 1
+    vperm2i128           m2, m0, m2, 0x31
+    vinserti128          m0, m4, xm5, 1
+    vperm2i128           m4, m4, m5, 0x31
+    psubw                m6, m7, m1
+.pass1_end:
+    pmulhrsw             m3, m1
+    pmulhrsw             m2, m6
+    pmulhrsw             m4, m1
+    pmulhrsw             m0, m6
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    punpckldq            m0, m1, m2
+    punpckhdq            m1, m2
+    punpckldq            m2, m3, m4
+    punpckhdq            m3, m4
+    jmp                tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd         m4, [o(pw_2048)]
+    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
+.end2:
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+.end3:
+    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*2]
+    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
+    RET
+ALIGN function_align
+.main:
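+    ; 4-point ADST across 16 columns, using the AV1 sinpi constants in Q12
+    ; (1321, 2482, 3344, 3803); products are accumulated in 32 bits and
+    ; rounded with pd_2048 before the >>12.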
+    vpbroadcastd         m6, [o(pw_m3344_3344)]
+    vpbroadcastd         m7, [o(pw_3803_1321)]
+    vpbroadcastd         m8, [o(pw_m1321_2482)]
+    vpbroadcastd         m9, [o(pw_2482_3344)]
+    punpcklwd            m4, m2, m0 ; in2 in0 l
+    punpckhwd            m2, m0     ; in2 in0 h
+    psrld                m5, m6, 16
+    pmaddwd             m10, m6, m4 ; t2:02 l
+    pmaddwd              m6, m2     ; t2:02 h
+    pmaddwd              m0, m7, m4 ; t0:02 l
+    pmaddwd              m7, m2     ; t0:02 h
+    pmaddwd              m4, m8     ; t1:02 l
+    pmaddwd              m8, m2     ; t1:02 h
+    punpckhwd            m2, m3, m1 ; in3 in1 h
+    punpcklwd            m3, m1     ; in3 in1 l
+    pmaddwd              m1, m5, m2 ; t2:3 h
+    pmaddwd              m5, m3     ; t2:3 l
+    paddd                m6, m1
+    vpbroadcastd         m1, [o(pd_2048)]
+    paddd               m10, m5
+    pmaddwd              m5, m9, m3
+    pmaddwd              m9, m2
+    paddd                m0, m1
+    paddd                m7, m1
+    paddd                m0, m5     ; t0 + t3 + 2048 l
+    paddd                m7, m9     ; t0 + t3 + 2048 h
+    vpbroadcastd         m9, [o(pw_m3803_3344)]
+    pmaddwd              m5, m9, m2
+    pmaddwd              m9, m3
+    paddd               m10, m1     ; t2 + 2048 l
+    paddd                m6, m1     ; t2 + 2048 h
+    paddd                m5, m1     ; t1:13 + 2048 h
+    paddd                m1, m9     ; t1:13 + 2048 l
+    vpbroadcastd         m9, [o(pw_m3803_m6688)]
+    pmaddwd              m2, m9
+    pmaddwd              m3, m9
+    paddd                m5, m8     ; t1 + t3 + 2048 h
+    paddd                m1, m4     ; t1 + t3 + 2048 l
+    paddd                m8, m7
+    paddd                m4, m0
+    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
+    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
+    REPX      {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
+    packssdw             m0, m7
+    packssdw             m1, m5
+    packssdw             m3, m2
+    packssdw             m2, m10, m6
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    vpermq               m0, [cq+32*0], q1230
+    vpermq               m3, [cq+32*3], q2103
+    vpermq               m1, [cq+32*1], q1230
+    vpermq               m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal).main2
+    call m(iadst_4x16_internal).main_pass1_end
+    punpckhwd            m4, m3, m2
+    punpckhwd            m5, m1, m0
+    punpcklwd            m0, m2
+    punpcklwd            m1, m3
+    vpbroadcastd         m6, [o(pw_16384)]
+    vinserti128          m3, m0, xm1, 1
+    vperm2i128           m2, m0, m1, 0x31
+    vinserti128          m0, m4, xm5, 1
+    vperm2i128           m4, m4, m5, 0x31
+    psubw                m1, m7, m6
+    jmp m(iadst_16x4_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call m(iadst_16x4_internal).main
+    vpbroadcastd         m4, [o(pw_2048)]
+    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
+    pxor                 m4, m4
+    mova          [cq+32*0], m4
+    mova          [cq+32*1], m4
+    mova          [cq+32*2], m4
+    mova          [cq+32*3], m4
+    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*2]
+    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
+    RET
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
+    mova                xm2, [cq+16*0]
+    mova                xm4, [cq+16*1]
+    vinserti128          m2, [cq+16*4], 1
+    vinserti128          m4, [cq+16*5], 1
+    mova                xm0, [cq+16*2]
+    mova                xm1, [cq+16*3]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
+    vpbroadcastd         m7, [o(pw_1697x16)]
+    vpbroadcastd         m8, [o(pw_16384)]
+    punpcklwd            m3, m2, m4
+    punpckhwd            m2, m4
+    punpcklwd            m4, m0, m1
+    punpckhwd            m0, m1
+    punpcklwd            m1, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m2, m4, m0
+    punpckhwd            m4, m0
+    pmulhrsw             m0, m7, m1
+    pmulhrsw             m5, m7, m2
+    pmulhrsw             m6, m7, m3
+    pmulhrsw             m7, m4
+    REPX   {pmulhrsw x, m8}, m0, m5, m6, m7
+    paddsw               m1, m0
+    paddsw               m2, m5
+    paddsw               m3, m6
+    paddsw               m4, m7
+    punpcklqdq           m0, m1, m2
+    punpckhqdq           m1, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m7, [o(pw_1697x8)]
+    pmulhrsw             m4, m7, m0
+    pmulhrsw             m5, m7, m1
+    pmulhrsw             m6, m7, m2
+    pmulhrsw             m7, m3
+    paddsw               m0, m4
+    paddsw               m1, m5
+    paddsw               m2, m6
+    paddsw               m3, m7
+    jmp m(iadst_16x4_internal).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 4
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+%endif
+%endmacro
+
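+; 16x8 is a 2:1 rectangular transform, so the coefficients are pre-scaled by
+; 2896/4096 = 1/sqrt(2) (pw_2896x8 via pmulhrsw) while loading.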
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    vpermq               m0, [cq+32*0], q3120
+    add                  cq, 32*4
+    vpermq               m7, [cq+32*3], q%1
+    vpermq               m1, [cq-32*3], q%1
+    vpermq               m6, [cq+32*2], q3120
+    vpermq               m2, [cq-32*2], q3120
+    vpermq               m5, [cq+32*1], q%1
+    vpermq               m3, [cq-32*1], q%1
+    vpermq               m4, [cq+32*0], q3120
+    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 3120
+    call m(idct_8x16_internal).main
+    vpbroadcastd        m10, [o(pw_16384)]
+    punpckhwd            m8, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd            m9, m4, m6
+    punpckhwd            m4, m6
+    punpcklwd            m6, m5, m7
+    punpckhwd            m5, m7
+    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
+    punpckhwd            m3, m0, m8
+    punpcklwd            m0, m8
+    punpckhwd            m8, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m9, m4
+    punpckhwd            m9, m4
+    punpcklwd            m4, m5, m6
+    punpckhwd            m5, m6
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m8
+    punpckhdq            m3, m8
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m8, m9, m5
+    punpckhdq            m9, m5
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, xm7, 1
+    vperm2i128           m6, m2, m8, 0x31
+    vinserti128          m2, xm8, 1
+    vperm2i128           m7, m3, m9, 0x31
+    vinserti128          m3, xm9, 1
+    jmp                tx2q
+.pass2:
+    call .main
+    vpbroadcastd         m8, [o(pw_2048)]
+.end:
+    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
+.end3:
+    pxor                 m0, m0
+    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
+    RET
+ALIGN function_align
+.main:
+    vpbroadcastd        m10, [o(pd_2048)]
+.main2:
+    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+    ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal).main2
+    call m(iadst_8x16_internal).main_pass1_end
+    psubw               m11, m9, m10
+    punpcklwd            m8, m0, m2
+    punpckhwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd            m9, m4, m6
+    punpckhwd            m4, m6
+    punpckhwd            m6, m5, m7
+    punpcklwd            m5, m7
+    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
+    jmp m(idct_16x8_internal).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    pxor                 m8, m8
+    psubw                m8, m9
+    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
+    jmp m(idct_16x8_internal).end2
+ALIGN function_align
+.main:
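+    ; 8-point ADST: four rotations by the odd Q12 cospi pairs, one round of
+    ; butterflies, then two rotations by 1567/3784 (sin/cos(pi/8) * 4096).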
+    vpbroadcastd        m10, [o(pd_2048)]
+    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
+    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+    psubsw               m8, m2, m6 ; t6
+    paddsw               m2, m6     ; t2
+    psubsw               m6, m0, m4 ; t4
+    paddsw               m0, m4     ; t0
+    psubsw               m4, m5, m1 ; t7
+    paddsw               m5, m1     ; t3
+    psubsw               m1, m7, m3 ; t5
+    paddsw               m7, m3     ; t1
+    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+    psubsw               m9, m6, m8 ;  t7
+    paddsw               m6, m8     ;  out6
+    psubsw               m3, m7, m5 ;  t3
+    paddsw               m7, m5     ; -out7
+    psubsw               m5, m0, m2 ;  t2
+    paddsw               m0, m2     ;  out0
+    psubsw               m2, m1, m4 ;  t6
+    paddsw               m1, m4     ; -out1
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd        m11, [o(pw_m2896_2896)]
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    punpckhwd            m4, m3, m5
+    punpcklwd            m3, m5
+    pmaddwd              m5, m11, m4
+    pmaddwd              m4, m12
+    pmaddwd              m8, m11, m3
+    pmaddwd              m3, m12
+    REPX     {paddd x, m10}, m5, m4, m8, m3
+    REPX     {psrad x, 12 }, m5, m8, m4, m3
+    packssdw             m3, m4     ; -out3
+    packssdw             m4, m8, m5 ;  out4
+    punpcklwd            m5, m9, m2
+    punpckhwd            m9, m2
+    pmaddwd              m2, m12, m5
+    pmaddwd              m5, m11
+    pmaddwd             m12, m9
+    pmaddwd             m11, m9
+    REPX     {paddd x, m10}, m2, m5, m12, m11
+    REPX     {psrad x, 12 }, m2, m12, m5, m11
+    packssdw             m2, m12    ;  out2
+    packssdw             m5, m11    ; -out5
+    ret
+ALIGN function_align
+.main_pass2_end:
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    psubsw               m4, m5, m3
+    paddsw               m3, m5
+    psubsw               m5, m2, m9
+    paddsw               m2, m9
+    pmulhrsw             m2, m8     ;  out2
+    pmulhrsw             m3, m8     ; -out3
+    pmulhrsw             m4, m8     ;  out4
+    pmulhrsw             m5, m8     ; -out5
+    vpbroadcastd         m9, [o(pw_2048)]
+    ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal).main2
+    call m(iadst_8x16_internal).main_pass1_end
+    psubw                m9, m10
+    punpcklwd            m8, m6, m4
+    punpckhwd            m6, m4
+    punpcklwd            m4, m7, m5
+    punpckhwd            m7, m5
+    punpckhwd            m5, m3, m1
+    punpcklwd            m3, m1
+    punpckhwd            m1, m2, m0
+    punpcklwd            m2, m0
+    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
+    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
+    punpcklwd            m0, m7, m4
+    punpckhwd            m7, m4
+    punpckhwd            m4, m6, m8
+    punpcklwd            m6, m8
+    punpckhwd            m8, m3, m5
+    punpcklwd            m3, m5
+    punpcklwd            m5, m2, m1
+    punpckhwd            m2, m1
+    punpckhdq            m1, m0, m6
+    punpckldq            m0, m6
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckhdq            m4, m3, m5
+    punpckldq            m3, m5
+    punpckldq            m5, m8, m2
+    punpckhdq            m8, m2
+    vinserti128          m2, m6, xm5, 1
+    vperm2i128           m6, m5, 0x31
+    vperm2i128           m5, m1, m4, 0x31
+    vinserti128          m1, xm4, 1
+    vperm2i128           m4, m0, m3, 0x31
+    vinserti128          m0, xm3, 1
+    vinserti128          m3, m7, xm8, 1
+    vperm2i128           m7, m8, 0x31
+    jmp                tx2q
+.pass2:
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass2_end
+    pxor                 m8, m8
+    psubw                m8, m9
+    pmulhrsw            m10, m7, m8
+    pmulhrsw             m7, m0, m9
+    pmulhrsw             m0, m6, m9
+    pmulhrsw             m6, m1, m8
+    pmulhrsw             m1, m5, m8
+    pmulhrsw             m5, m2, m9
+    pmulhrsw             m2, m4, m9
+    pmulhrsw             m4, m3, m8
+    lea                  r3, [strideq*3]
+    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
+    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
+    jmp m(idct_16x8_internal).end3
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
+    mova                xm7, [cq+16*0]
+    mova                xm2, [cq+16*1]
+    add                  cq, 16*8
+    vpbroadcastd         m3, [o(pw_2896x8)]
+    vinserti128          m7, [cq+16*0], 1
+    vinserti128          m2, [cq+16*1], 1
+    mova                xm6, [cq-16*6]
+    mova                xm4, [cq-16*5]
+    vinserti128          m6, [cq+16*2], 1
+    vinserti128          m4, [cq+16*3], 1
+    mova                xm8, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m8, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm0, [cq-16*2]
+    mova                xm1, [cq-16*1]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
+    vpbroadcastd        m10, [o(pw_1697x16)]
+    vpbroadcastd        m11, [o(pw_16384)]
+    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+    punpcklwd            m3, m7, m2
+    punpckhwd            m7, m2
+    punpcklwd            m2, m6, m4
+    punpckhwd            m6, m4
+    punpcklwd            m4, m8, m5
+    punpckhwd            m8, m5
+    punpcklwd            m5, m0, m1
+    punpckhwd            m0, m1
+    punpckldq            m1, m3, m2
+    punpckhdq            m3, m2
+    punpckldq            m2, m4, m5
+    punpckhdq            m4, m5
+    punpckldq            m5, m7, m6
+    punpckhdq            m7, m6
+    punpckldq            m6, m8, m0
+    punpckhdq            m8, m0
+    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
+    punpcklqdq           m0, m1, m2
+    punpckhqdq           m1, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m6
+    punpckhqdq           m5, m6
+    punpcklqdq           m6, m7, m8
+    punpckhqdq           m7, m8
+    jmp                tx2q
+.pass2:
+    vpbroadcastd         m8, [o(pw_4096)]
+    jmp m(idct_16x8_internal).end
+
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 8
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+%endif
+%endmacro
+
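+; All 16 rows occupy m0-m15; row 15 is also spilled to the stack so that
+; .main can reuse m15 for the rounding constant and reload in15 from [rsp].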
+%macro ITX_16X16_LOAD_COEFS 0
+    mova                 m0, [cq+32*0]
+    mova                 m1, [cq+32*1]
+    mova                 m2, [cq+32*2]
+    mova                 m3, [cq+32*3]
+    add                  cq, 32*8
+    mova                 m4, [cq-32*4]
+    mova                 m5, [cq-32*3]
+    mova                 m6, [cq-32*2]
+    mova                 m7, [cq-32*1]
+    mova                 m8, [cq+32*0]
+    mova                 m9, [cq+32*1]
+    mova                m10, [cq+32*2]
+    mova                m11, [cq+32*3]
+    mova                m12, [cq+32*4]
+    mova                m13, [cq+32*5]
+    mova                m14, [cq+32*6]
+    mova                m15, [cq+32*7]
+    mova              [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+.pass1_end:
+    vpbroadcastd         m1, [o(pw_8192)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    vextracti128 [rsp+16*5], m8, 1
+    mova         [rsp+16*1], xm8
+.pass1_end2:
+    vextracti128 [rsp+16*4], m0, 1
+    mova         [rsp+16*0], xm0
+    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw             m1, [rsp+32*1]
+    vperm2i128           m8, m1, m9, 0x31
+    vinserti128          m1, xm9, 1
+    vperm2i128           m9, m2, m10, 0x31
+    vinserti128          m2, xm10, 1
+    vperm2i128          m10, m3, m11, 0x31
+    vinserti128          m3, xm11, 1
+    vperm2i128          m11, m4, m12, 0x31
+    vinserti128          m4, xm12, 1
+    vperm2i128          m12, m5, m13, 0x31
+    vinserti128          m5, xm13, 1
+    vperm2i128          m13, m6, m14, 0x31
+    vinserti128          m6, xm14, 1
+    vperm2i128          m14, m7, m15, 0x31
+    vinserti128          m7, xm15, 1
+    mova                m15, [rsp+32*2]
+.pass1_end3:
+    punpcklwd            m0, m9, m10
+    punpckhwd            m9, m10
+    punpcklwd           m10, m15, m8
+    punpckhwd           m15, m8
+    punpckhwd            m8, m11, m12
+    punpcklwd           m11, m12
+    punpckhwd           m12, m13, m14
+    punpcklwd           m13, m14
+    punpckhdq           m14, m11, m13
+    punpckldq           m11, m13
+    punpckldq           m13, m15, m9
+    punpckhdq           m15, m9
+    punpckldq            m9, m10, m0
+    punpckhdq           m10, m0
+    punpckhdq            m0, m8, m12
+    punpckldq            m8, m12
+    punpcklqdq          m12, m13, m8
+    punpckhqdq          m13, m8
+    punpcklqdq           m8, m9, m11
+    punpckhqdq           m9, m11
+    punpckhqdq          m11, m10, m14
+    punpcklqdq          m10, m14
+    punpcklqdq          m14, m15, m0
+    punpckhqdq          m15, m0
+    mova                 m0, [rsp]
+    mova              [rsp], m15
+    punpckhwd           m15, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m15, m1
+    punpckhdq           m15, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m15
+    punpcklqdq           m6, m15
+    jmp                tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd         m1, [o(pw_2048)]
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova              [rsp], m6
+.end2:
+    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw             m1, [rsp+32*1]
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3
+.end3:
+    pxor                 m2, m2
+    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
+    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
+    RET
+ALIGN function_align
+.main:
+    vpbroadcastd        m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m1
+    mova [rsp+gprsize+32*2], m9
+    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
+    mova                 m1, [rsp+gprsize+32*2] ; in9
+    mova [rsp+gprsize+32*2], m14 ; tmp7
+    mova                 m9, [rsp+gprsize+32*1] ; in1
+    mova [rsp+gprsize+32*1], m10 ; tmp5
+    mova                m14, [rsp+gprsize+32*0] ; in15
+    mova [rsp+gprsize+32*0], m6  ; tmp3
+    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
+    mova                 m6, [rsp+gprsize+32*1] ; tmp5
+    psubsw              m15, m0, m14  ; out15
+    paddsw               m0, m14      ; out0
+    psubsw              m14, m2, m13  ; out14
+    paddsw               m2, m13      ; out1
+    mova [rsp+gprsize+32*1], m2
+    psubsw              m13, m4, m11  ; out13
+    paddsw               m2, m4, m11  ; out2
+    psubsw              m11, m8, m7   ; out11
+    paddsw               m4, m8, m7   ; out4
+    mova                 m7, [rsp+gprsize+32*2] ; tmp7
+    psubsw              m10, m6, m5   ; out10
+    paddsw               m5, m6       ; out5
+    psubsw               m8, m7, m9   ; out8
+    paddsw               m7, m9       ; out7
+    psubsw               m9, m12, m3  ; out9
+    paddsw               m6, m12, m3  ; out6
+    mova                 m3, [rsp+gprsize+32*0] ; tmp3
+    psubsw              m12, m3, m1   ; out12
+    paddsw               m3, m1       ; out3
+    ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+    call .main_pass1_end
+    pmulhrsw             m0, m1, [cq+32*0]
+    pmulhrsw             m2, m1, [cq+32*1]
+    REPX   {pmulhrsw x, m1}, m4, m6, m8, m10
+    pmulhrsw            m12, m1, [cq+32*2]
+    pmulhrsw            m14, m1, [cq+32*3]
+    vextracti128 [rsp+16*5], m8, 1
+    mova         [rsp+16*1], xm8
+    pxor                 m8, m8
+    psubw                m1, m8, m1
+    jmp m(idct_16x16_internal).pass1_end2
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova         [rsp+32*0], m6
+    pxor                 m6, m6
+    psubw                m1, m6, m1
+    jmp m(idct_16x16_internal).end2
+ALIGN function_align
+.main:
+    vpbroadcastd        m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*2], m4
+    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
+    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
+    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
+    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
+    psubsw               m0, m2, m10  ; t10a
+    paddsw               m2, m10      ; t2a
+    psubsw              m10, m13, m5  ; t11a
+    paddsw              m13, m5       ; t3a
+    psubsw               m5, m6, m14  ; t14a
+    paddsw               m6, m14      ; t6a
+    psubsw              m14, m9, m1   ; t15a
+    paddsw               m9, m1       ; t7a
+    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
+    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
+    psubsw               m1, m10, m14 ; t14a
+    paddsw              m10, m14      ; t10a
+    psubsw              m14, m0, m5   ; t15a
+    paddsw               m0, m5       ; t11a
+    psubsw               m5, m2, m6   ; t6
+    paddsw               m2, m6       ; t2
+    psubsw               m6, m13, m9  ; t7
+    paddsw              m13, m9       ; t3
+    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
+    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
+    mova                 m9, [rsp+gprsize+32*0] ; in15
+    mova [rsp+gprsize+32*0], m10 ; t10a
+    mova                 m4, [rsp+gprsize+32*1] ; in0
+    mova [rsp+gprsize+32*1], m6  ; t6a
+    mova                 m6, [rsp+gprsize+32*2] ; in4
+    mova [rsp+gprsize+32*2], m2  ; t2
+    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
+    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
+    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
+    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
+    psubsw              m10, m4, m8  ; t8a
+    paddsw               m8, m4      ; t0a
+    psubsw               m4, m9, m7  ; t9a
+    paddsw               m9, m7      ; t1a
+    psubsw               m7, m6, m12 ; t12a
+    paddsw               m6, m12     ; t4a
+    psubsw              m12, m11, m3 ; t13a
+    paddsw              m11, m3      ; t5a
+    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
+    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
+    psubsw               m3, m9, m11 ; t5
+    paddsw               m9, m11     ; t1
+    psubsw              m11, m4, m12 ; t12a
+    paddsw               m4, m12     ; t8a
+    paddsw              m12, m8, m6  ; t0
+    psubsw               m8, m6      ; t4
+    paddsw               m6, m10, m7 ; t9a
+    psubsw              m10, m7      ; t13a
+    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
+    mova                 m7, [rsp+gprsize+32*0] ; t10a
+    mova                 m2, [rsp+gprsize+32*1] ; t6a
+    paddsw              m15, m9, m13  ; -out15
+    psubsw               m9, m13      ;  t3a
+    paddsw              m13, m11, m1  ; -out13
+    psubsw              m11, m1       ;  t15a
+    psubsw               m1, m4, m7   ;  t10
+    paddsw               m7, m4       ; -out1
+    psubsw               m4, m3, m2   ;  t6
+    paddsw               m3, m2       ; -out3
+    paddsw               m2, m10, m14 ;  out2
+    psubsw              m10, m14      ;  t14a
+    paddsw              m14, m6, m0   ;  out14
+    psubsw               m6, m0       ;  t11
+    mova                 m0, [rsp+gprsize+32*2] ; t2
+    mova [rsp+gprsize+32*1], m7
+    psubsw               m7, m12, m0  ;  t2a
+    paddsw               m0, m12      ;  out0
+    paddsw              m12, m8, m5   ;  out12
+    psubsw               m8, m5       ;  t7
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova          [cq+32*0], m0
+    mova          [cq+32*1], m2
+    mova          [cq+32*2], m12
+    mova          [cq+32*3], m14
+    vpbroadcastd        m14, [pw_m2896_2896]
+    vpbroadcastd        m12, [pw_2896_2896]
+    vpbroadcastd         m2, [pd_2048]
+    punpcklwd            m5, m11, m10
+    punpckhwd           m11, m10
+    pmaddwd             m10, m14, m5
+    pmaddwd              m0, m14, m11
+    pmaddwd              m5, m12
+    pmaddwd             m11, m12
+    REPX      {paddd x, m2}, m10, m0, m5, m11
+    REPX      {psrad x, 12}, m10, m0, m5, m11
+    packssdw            m10, m0  ;  out10
+    packssdw             m5, m11 ; -out5
+    punpcklwd           m11, m8, m4
+    punpckhwd            m8, m4
+    pmaddwd              m4, m12, m11
+    pmaddwd              m0, m12, m8
+    pmaddwd             m11, m14
+    pmaddwd              m8, m14
+    REPX      {paddd x, m2}, m4, m0, m11, m8
+    REPX      {psrad x, 12}, m4, m0, m11, m8
+    packssdw             m4, m0  ;  out4
+    packssdw            m11, m8  ; -out11
+    punpcklwd            m8, m9, m7
+    punpckhwd            m9, m7
+    pmaddwd              m7, m12, m8
+    pmaddwd              m0, m12, m9
+    pmaddwd              m8, m14
+    pmaddwd              m9, m14
+    REPX      {paddd x, m2}, m7, m0, m8, m9
+    REPX      {psrad x, 12}, m7, m0, m8, m9
+    packssdw             m7, m0  ; -out7
+    packssdw             m8, m9  ;  out8
+    punpckhwd            m0, m6, m1
+    punpcklwd            m6, m1
+    pmaddwd              m1, m14, m0
+    pmaddwd              m9, m14, m6
+    pmaddwd              m0, m12
+    pmaddwd              m6, m12
+    REPX      {paddd x, m2}, m1, m9, m0, m6
+    REPX      {psrad x, 12}, m1, m9, m0, m6
+    packssdw             m9, m1  ; -out9
+    packssdw             m6, m0  ;  out6
+    vpbroadcastd         m1, [o(pw_8192)]
+    ret
+ALIGN function_align
+.main_pass2_end:
+    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+    ; 16-bit here will produce the same result as using 32-bit intermediates.
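+    ; (Compare .main_pass1_end, which computes the same (a +/- b)/sqrt(2)
+    ; outputs with 32-bit pmaddwd intermediates; here paddsw/psubsw followed
+    ; by a single pmulhrsw with pw_2896x8 may saturate, which is harmless.)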
+    paddsw               m5, m10, m11 ; -out5
+    psubsw              m10, m11      ;  out10
+    psubsw              m11, m4, m8   ; -out11
+    paddsw               m4, m8       ;  out4
+    psubsw               m8, m7, m9   ;  out8
+    paddsw               m7, m9       ; -out7
+    psubsw               m9, m1, m6   ; -out9
+    paddsw               m6, m1       ;  out6
+    vpbroadcastd         m1, [o(pw_2896x8)]
+    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+    vpbroadcastd         m1, [o(pw_2048)]
+    ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call m(iadst_16x16_internal).main
+    call m(iadst_16x16_internal).main_pass1_end
+    pmulhrsw             m6, m1
+    pmulhrsw             m2, m1, m8
+    mova         [rsp+32*2], m6
+    pmulhrsw             m6, m1, m4
+    pmulhrsw             m4, m1, m10
+    pmulhrsw             m8, m1, [cq+32*3]
+    pmulhrsw            m10, m1, [cq+32*2]
+    pmulhrsw            m12, m1, [cq+32*1]
+    pmulhrsw            m14, m1, [cq+32*0]
+    pxor                 m0, m0
+    psubw                m0, m1
+    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+    pmulhrsw             m1, m0, m9
+    pmulhrsw             m9, m0, m13
+    pmulhrsw             m0, [rsp+32*1]
+    mova         [rsp+16*0], xm15
+    mova         [rsp+16*1], xm7
+    vperm2i128          m15, m15, m7, 0x31
+    vinserti128          m7, m2, xm14, 1
+    vperm2i128          m14, m2, m14, 0x31
+    vinserti128          m2, m9, xm5, 1
+    vperm2i128           m9, m9, m5, 0x31
+    vinserti128          m5, m4, xm12, 1
+    vperm2i128          m12, m4, m12, 0x31
+    vinserti128          m4, m11, xm3, 1
+    vperm2i128          m11, m11, m3, 0x31
+    vinserti128          m3, m10, xm6, 1
+    vperm2i128          m10, m10, m6, 0x31
+    vinserti128          m6, m1, xm0, 1
+    vperm2i128          m13, m1, m0, 0x31
+    vinserti128          m1, m8, [rsp+32*2], 1
+    vperm2i128           m8, m8, [rsp+32*2], 0x31
+    jmp m(idct_16x16_internal).pass1_end3
+.pass2:
+    call m(iadst_16x16_internal).main
+    call m(iadst_16x16_internal).main_pass2_end
+    pmulhrsw             m0, m1
+    pmulhrsw             m8, m1
+    mova         [rsp+32*0], m0
+    mova         [rsp+32*2], m8
+    pxor                 m0, m0
+    psubw                m0, m1
+    pmulhrsw             m8, m0, m7
+    pmulhrsw             m7, m0, m9
+    pmulhrsw             m9, m1, m6
+    pmulhrsw             m6, m1, m10
+    pmulhrsw            m10, m0, m5
+    pmulhrsw             m5, m0, m11
+    pmulhrsw            m11, m1, m4
+    pmulhrsw             m4, m1, m12
+    pmulhrsw            m12, m0, m3
+    pmulhrsw             m3, m0, m13
+    pmulhrsw            m13, m1, m2
+    pmulhrsw             m1, m14
+    pmulhrsw            m14, m0, [rsp+32*1]
+    pmulhrsw             m0, m15
+    lea                  r3, [strideq*3]
+    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1
+    mova                m15, [rsp+32*0]
+    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3
+    jmp m(idct_16x16_internal).end3
+
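+; Like IDTX16, but also folds in the 1/4 pass-1 downscale: x*1697/2048 is
+; halved by the psraw, and pavgw then averages it with x, giving
+; 2*sqrt(2)/4 * x with correct rounding (hence no pw_8192 multiply later).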
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+    pmulhrsw            m%2, m%3, m%1
+    psraw               m%2, 1
+    pavgw               m%1, m%2 ; signs are guaranteed to be equal
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    vpbroadcastd         m7, [o(pw_1697x16)]
+    mova                xm0, [cq+16* 0]
+    vinserti128          m0, [cq+16*16], 1
+    mova               xm15, [cq+16* 1]
+    vinserti128         m15, [cq+16*17], 1
+    mova                xm1, [cq+16* 2]
+    vinserti128          m1, [cq+16*18], 1
+    mova                xm8, [cq+16* 3]
+    vinserti128          m8, [cq+16*19], 1
+    mova                xm2, [cq+16* 4]
+    vinserti128          m2, [cq+16*20], 1
+    mova                xm9, [cq+16* 5]
+    vinserti128          m9, [cq+16*21], 1
+    mova                xm3, [cq+16* 6]
+    vinserti128          m3, [cq+16*22], 1
+    mova               xm10, [cq+16* 7]
+    add                  cq, 16*16
+    vinserti128         m10, [cq+16* 7], 1
+    mova                xm4, [cq-16* 8]
+    vinserti128          m4, [cq+16* 8], 1
+    mova               xm11, [cq-16* 7]
+    vinserti128         m11, [cq+16* 9], 1
+    mova                xm5, [cq-16* 6]
+    vinserti128          m5, [cq+16*10], 1
+    mova               xm12, [cq-16* 5]
+    vinserti128         m12, [cq+16*11], 1
+    mova               xm13, [cq-16* 3]
+    vinserti128         m13, [cq+16*13], 1
+    mova               xm14, [cq-16* 1]
+    vinserti128         m14, [cq+16*15], 1
+    REPX  {IDTX16B x, 6, 7},  0, 15,  1,  8,  2,  9,  3, \
+                             10,  4, 11,  5, 12, 13, 14
+    mova                xm6, [cq-16* 4]
+    vinserti128          m6, [cq+16*12], 1
+    mova              [rsp], m0
+    IDTX16B               6, 0, 7
+    mova                xm0, [cq-16* 2]
+    vinserti128          m0, [cq+16*14], 1
+    pmulhrsw             m7, m0
+    psraw                m7, 1
+    pavgw                m7, m0
+    jmp m(idct_16x16_internal).pass1_end3
+ALIGN function_align
+.pass2:
+    vpbroadcastd        m15, [o(pw_1697x16)]
+    mova         [rsp+32*1], m0
+    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
+                              8,  9, 10, 11, 12, 13, 14
+    mova                 m0, [rsp+32*1]
+    mova         [rsp+32*1], m1
+    IDTX16                0, 1, 15
+    mova                 m1, [rsp+32*0]
+    pmulhrsw            m15, m1
+    paddsw               m1, m1
+    paddsw              m15, m1
+    jmp m(idct_16x16_internal).end
+
+%define o_base deint_shuf + 128
+
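+; Load 8 rows of coefficients; if is_rect2 is set they are pre-scaled by
+; 1/sqrt(2) (pw_2896x8) as required for 2:1 rectangular transforms.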
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [%1+%2*0]
+    pmulhrsw             m1, m15, [%1+%2*1]
+    pmulhrsw             m2, m15, [%1+%2*2]
+    pmulhrsw             m3, m15, [%1+%2*3]
+    pmulhrsw             m4, m15, [%1+%2*4]
+    pmulhrsw             m5, m15, [%1+%2*5]
+    pmulhrsw             m6, m15, [%1+%2*6]
+    pmulhrsw             m7, m15, [%1+%2*7]
+%else
+    mova                 m0, [%1+%2*0]
+    mova                 m1, [%1+%2*1]
+    mova                 m2, [%1+%2*2]
+    mova                 m3, [%1+%2*3]
+    mova                 m4, [%1+%2*4]
+    mova                 m5, [%1+%2*5]
+    mova                 m6, [%1+%2*6]
+    mova                 m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+    vpbroadcastd        m15, [o(pw_2896x8)]
+%endif
+    pmulhrsw             m8, m15, [%1+%2*0]
+    pmulhrsw             m9, m15, [%1+%2*1]
+    pmulhrsw            m10, m15, [%1+%2*2]
+    pmulhrsw            m11, m15, [%1+%2*3]
+    pmulhrsw            m12, m15, [%1+%2*4]
+    pmulhrsw            m13, m15, [%1+%2*5]
+    pmulhrsw            m14, m15, [%1+%2*6]
+    pmulhrsw            m15,      [%1+%2*7]
+%else
+    mova                 m8, [%1+%2*0]
+    mova                 m9, [%1+%2*1]
+    mova                m10, [%1+%2*2]
+    mova                m11, [%1+%2*3]
+    mova                m12, [%1+%2*4]
+    mova                m13, [%1+%2*5]
+    mova                m14, [%1+%2*6]
+    mova                m15, [%1+%2*7]
+%endif
+%endmacro
+
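+; When the second input of a 2x2 rotation is known to be zero, the rotation
+; reduces to two plain scalings: duplicate each word and pmulhrsw by packed
+; x8 constants, i.e. round(x*coef/4096). r5 holds the address of
+; pw_201_4091x8, so the other coefficient pairs are addressed relative to it.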
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+    punpcklwd           m%1, m%2, m%2
+    pmulhrsw            m%1, m%3
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+    punpckhwd           m%2, m%2
+    pmulhrsw            m%2, m%3
+%endmacro
+
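+; 8x32: pass 1 runs the 8-point row DCT on 16 rows at a time (two 8-wide
+; rows per register, via the 16x8 main); an eob of 106 or less appears to
+; guarantee that rows 16-31 are all zero, so only the .fast half is
+; transformed. Pass 2 is the 32-point column DCT, with .main_fast skipping
+; the known-zero odd inputs.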
+cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jz .dconly
+    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    cmp                eobd, 106
+    jle .fast
+    LOAD_8ROWS      cq+32*1, 32*2
+    call m(idct_16x8_internal).main
+    vperm2i128          m11, m0, m4, 0x31
+    vinserti128          m0, xm4, 1
+    vperm2i128           m4, m1, m5, 0x31
+    vinserti128          m1, xm5, 1
+    vperm2i128           m5, m2, m6, 0x31
+    vinserti128          m2, xm6, 1
+    vperm2i128           m6, m3, m7, 0x31
+    vinserti128          m3, xm7, 1
+    pxor                 m7, m7
+    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+    punpckhwd            m7, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpcklwd            m3, m11, m4
+    punpckhwd           m11, m4
+    punpckhwd            m4, m5, m6
+    punpcklwd            m5, m6
+    punpckhdq            m6, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m5
+    punpckhdq            m3, m5
+    punpckhdq            m5, m11, m4
+    punpckldq           m11, m4
+    punpckldq            m4, m7, m1
+    punpckhdq            m7, m1
+    punpckhqdq          m12, m6, m0
+    punpcklqdq           m0, m6     ; out4
+    punpckhqdq          m13, m7, m4
+    punpcklqdq           m4, m7     ; out5
+    punpckhqdq          m14, m3, m2
+    punpcklqdq           m2, m3     ; out6
+    punpckhqdq          m15, m5, m11
+    punpcklqdq          m11, m5     ; out7
+    mova         [rsp+32*0], m0
+    mova         [rsp+32*1], m4
+    mova         [rsp+32*2], m2
+.fast:
+    LOAD_8ROWS      cq+32*0, 32*2
+    call m(idct_16x8_internal).main
+    vperm2i128           m8, m0, m4, 0x31
+    vinserti128          m0, xm4, 1
+    vperm2i128           m4, m1, m5, 0x31
+    vinserti128          m1, xm5, 1
+    vperm2i128           m5, m2, m6, 0x31
+    vinserti128          m2, xm6, 1
+    vperm2i128           m6, m3, m7, 0x31
+    vinserti128          m3, xm7, 1
+    vpbroadcastd         m9, [o(pw_8192)]
+    pxor                 m7, m7
+    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+    punpckhwd            m7, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m8, m4
+    punpcklwd            m8, m4
+    punpckhwd            m4, m5, m6
+    punpcklwd            m5, m6
+    punpckhdq            m6, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m8, m5
+    punpckhdq            m8, m5
+    punpckhdq            m5, m3, m4
+    punpckldq            m3, m4
+    punpckhdq            m4, m7, m1
+    punpckldq            m7, m1
+    punpcklqdq           m1, m7, m4
+    punpckhqdq           m7, m4     ; out9
+    punpckhqdq           m4, m2, m8 ; out10
+    punpcklqdq           m2, m8
+    punpckhqdq           m8, m3, m5
+    punpcklqdq           m3, m5
+    punpckhqdq           m5, m0, m6 ; out8
+    punpcklqdq           m0, m6
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+    cmp                eobd, 106
+    jg .full
+    mova         [rsp+32*0], m5
+    mova         [rsp+32*1], m7
+    mova         [rsp+32*2], m4
+    pmulhrsw            m11, m9, m8
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    call .main_fast
+    jmp .pass2
+.dconly:
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm2
+    psrlw               xm2, 2 ; pw_2048
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    mov                 r2d, 8
+    jmp m(inv_txfm_add_dct_dct_8x8).end2
+.full:
+    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15
+    pmulhrsw             m6, m9, [rsp+32*2]
+    mova         [rsp+32*2], m4
+    pmulhrsw             m4, m9, [rsp+32*0]
+    mova         [rsp+32*0], m5
+    pmulhrsw             m5, m9, [rsp+32*1]
+    mova         [rsp+32*1], m7
+    pmulhrsw             m7, m9, m11
+    pmulhrsw            m11, m9, m8
+    call .main
+.pass2:
+    vpbroadcastd        m12, [o(pw_2048)]
+    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8,  m9,  m10, m11,      m13, m14, m15
+    pmulhrsw            m12, [rsp]
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+    mova         [rsp+32*0], m4
+    mova         [rsp+32*1], m6
+    lea                  r3, [strideq*3]
+    WRITE_8X4             0,  1,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             2,  3,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4    [rsp+32*0],  5,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4    [rsp+32*1],  7,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4             8,  9,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            10, 11,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            12, 13,  4,  6
+    lea                dstq, [dstq+strideq*4]
+    WRITE_8X4            14, 15,  4,  6
+    RET
+ALIGN function_align
+.main_fast: ; bottom half is zero
+    call m(idct_8x16_internal).main
+    mova                 m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova                 m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova                 m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    lea                  r5, [rax-(o_base)+pw_201_4091x8]
+    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
+    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+    jmp .main2
+ALIGN function_align
+.main:
+    call m(idct_8x16_internal).main
+    mova                 m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova                 m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova                 m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    punpcklwd            m1, m15, m8  ; in31 in1
+    punpckhwd            m8, m15      ; in3  in29
+    punpcklwd           m15, m14, m9  ; in27 in5
+    punpckhwd            m9, m14      ; in7  in25
+    punpcklwd           m14, m13, m0  ; in23 in9
+    punpckhwd            m0, m13      ; in11 in21
+    punpcklwd           m13, m12, m11 ; in19 in13
+    punpckhwd           m11, m12      ; in15 in17
+    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
+    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
+    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
+    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+    psubsw               m6, m1, m11  ; t17 t30
+    paddsw               m1, m11      ; t16 t31
+    psubsw              m11, m9, m14  ; t18 t29
+    paddsw               m9, m14      ; t19 t28
+    psubsw              m14, m15, m0  ; t21 t26
+    paddsw              m15, m0       ; t20 t27
+    psubsw               m0, m8, m13  ; t22 t25
+    paddsw               m8, m13      ; t23 t24
+    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
+    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
+    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
+    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+    psubsw              m13, m1, m9   ; t19a t28a
+    paddsw               m1, m9       ; t16a t31a
+    psubsw               m9, m8, m15  ; t20a t27a
+    paddsw               m8, m15      ; t23a t24a
+    psubsw              m15, m6, m11  ; t18  t29
+    paddsw               m6, m11      ; t17  t30
+    psubsw              m11, m0, m14  ; t21  t26
+    paddsw               m0, m14      ; t22  t25
+    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
+    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
+    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
+    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
+    vbroadcasti128      m12, [o(deint_shuf)]
+    psubsw              m14, m1, m8   ; t23  t24
+    paddsw               m1, m8       ; t16  t31
+    psubsw               m8, m6, m0   ; t22a t25a
+    paddsw               m6, m0       ; t17a t30a
+    psubsw               m0, m15, m11 ; t21  t26
+    paddsw              m15, m11      ; t18  t29
+    psubsw              m11, m13, m9  ; t20a t27a
+    paddsw              m13, m9       ; t19a t28a
+    REPX    {pshufb x, m12}, m1, m6, m15, m13
+    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
+    vpbroadcastd         m9, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
+    shufps               m9, m14, m8, q1032 ; t23a t22
+    vpblendd            m14, m8, 0xcc       ; t24a t25
+    shufps               m8, m11, m0, q1032 ; t20  t21a
+    vpblendd            m11, m0, 0xcc       ; t27  t26a
+    punpcklqdq           m0, m1, m6   ; t16  t17a
+    punpckhqdq           m1, m6       ; t31  t30a
+    psubsw              m10, m5, m8   ; out20 out21
+    paddsw               m5, m8       ; out11 out10
+    psubsw               m6, m3, m14  ; out24 out25
+    paddsw               m3, m14      ; out7  out6
+    psubsw               m8, m7, m0   ; out16 out17
+    paddsw               m7, m0       ; out15 out14
+    mova                 m0, [rsp+gprsize+0*32]
+    punpcklqdq          m12, m13, m15 ; t19a t18
+    punpckhqdq          m13, m15      ; t28a t29
+    psubsw              m15, m0, m1   ; out31 out30
+    paddsw               m0, m1       ; out0  out1
+    mova                 m1, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+0*32], m6
+    mova                 m6, [rsp+gprsize+2*32]
+    psubsw              m14, m1, m13  ; out28 out29
+    paddsw               m1, m13      ; out3  out2
+    psubsw              m13, m2, m11  ; out27 out26
+    paddsw               m2, m11      ; out4  out5
+    psubsw              m11, m4, m9   ; out23 out22
+    paddsw               m4, m9       ; out8  out9
+    psubsw               m9, m6, m12  ; out19 out18
+    paddsw               m6, m12      ; out12 out13
+    ret
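+    ; the routine above keeps two t-values packed per register (see the
+    ; "t16 t31"-style comments), so every paddsw/psubsw is two butterflies
+    ; and each ITX_MUL2X_PACK rotates both halves of a pair at once; the
+    ; deint_shuf pshufb and the shufps/vpblendd sequence re-pair values
+    ; before the final sumsub against the idct16 half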
+
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
+    vbroadcasti128      m%1, [cq+16*%3]
+    vbroadcasti128      m%2, [cq+16*%4]
+    shufpd              m%1, m%2, 0x0c
+%endmacro
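+; LOAD_PACKED_16X2 packs two 16-byte (8-coefficient) rows into one ymm:
+; vbroadcasti128 duplicates each row across both lanes, then shufpd with
+; immediate 0x0c keeps qwords {A0,B0,A1,B1}, i.e. both rows with their
+; halves interleaved across lanes.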
+
+cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 8
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [pw_2048] ; intentionally rip-relative
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m3, m3
+.dconly_loop:
+    mova                 m1, [dstq]
+    punpckhbw            m2, m1, m3
+    punpcklbw            m1, m3
+    paddw                m2, m0
+    paddw                m1, m0
+    packuswb             m1, m2
+    mova             [dstq], m1
+    add                dstq, strideq
+    dec                 r2d
+    jg .dconly_loop
+    RET
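+    ; for reference, each pmulhrsw above is one fixed-point scaling stage:
+    ; per 16-bit word, pmulhrsw(x, c) = (x*c + 0x4000) >> 15, so with
+    ; constants stored pre-scaled by 8 (pw_2896x8 etc.) it computes
+    ; (x*2896 + 2048) >> 12 at the usual 12-bit transform precision; the
+    ; eob == 0 case thus reduces to one rounded DC value broadcast with
+    ; vpbroadcastw and added to every pixel in .dconly_loop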
+.normal:
+    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
+    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
+    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
+    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    add                  cq, 16*16
+    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
+    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
+    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
+    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+    mova         [rsp+32*0], m4
+    mova         [rsp+32*1], m5
+    mova         [rsp+32*2], m6
+    cmp                eobd, 106
+    jg .full
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    call m(inv_txfm_add_dct_dct_8x32).main_fast
+    jmp .pass2
+.full:
+    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
+    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
+    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
+    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    add                  cq, 16*8
+    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
+    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
+    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
+    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
+    call m(inv_txfm_add_dct_dct_8x32).main
+.pass2:
+    vpbroadcastd        m12, [o(pw_8192)]
+    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+    mova         [rsp+32*1], m9
+    mova         [rsp+32*2], m10
+    punpckhwd            m9, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m3
+    punpcklwd            m1, m3
+    punpcklwd           m10, m4, m6
+    punpckhwd            m4, m6
+    punpcklwd            m6, m5, m7
+    punpckhwd            m5, m7
+    punpckhwd            m3, m0, m9
+    punpcklwd            m0, m9
+    punpckhwd            m9, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m10, m4
+    punpckhwd           m10, m4
+    punpcklwd            m4, m5, m6
+    punpckhwd            m5, m6
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m9
+    punpckhdq            m3, m9
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m9, m10, m5
+    punpckhdq           m10, m5
+    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+    pmulhrsw            m12, [rsp+32*0]
+    mova         [rsp+32*0], m8
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, xm7, 1
+    vperm2i128           m6, m2, m9, 0x31
+    vinserti128          m2, xm9, 1
+    vperm2i128           m7, m3, m10, 0x31
+    vinserti128          m3, xm10, 1
+    call m(idct_16x8_internal).main
+    vpbroadcastd         m8, [o(pw_2048)]
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    lea                  r2, [strideq*3]
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
+    lea                  r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
+    mova                 m0, [rsp+32*0]
+    mova                 m1, [rsp+32*1]
+    mova                 m2, [rsp+32*2]
+    punpckhwd            m7, m0, m2
+    punpcklwd            m0, m2
+    punpckhwd            m2, m1, m11
+    punpcklwd            m1, m11
+    punpckhwd            m4, m12, m14
+    punpcklwd           m12, m14
+    punpckhwd            m5, m13, m15
+    punpcklwd           m13, m15
+    punpckhwd            m3, m0, m7
+    punpcklwd            m0, m7
+    punpckhwd            m9, m2, m1
+    punpcklwd            m2, m1
+    punpcklwd            m7, m12, m4
+    punpckhwd           m12, m4
+    punpcklwd            m4, m5, m13
+    punpckhwd            m5, m13
+    punpckhdq            m1, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m3, m9
+    punpckhdq            m3, m9
+    punpckldq            m6, m7, m4
+    punpckhdq            m7, m4
+    punpckldq            m9, m12, m5
+    punpckhdq           m12, m5
+    vperm2i128           m4, m0, m6, 0x31
+    vinserti128          m0, xm6, 1
+    vperm2i128           m5, m1, m7, 0x31
+    vinserti128          m1, xm7, 1
+    vperm2i128           m6, m2, m9, 0x31
+    vinserti128          m2, xm9, 1
+    vperm2i128           m7, m3, m12, 0x31
+    vinserti128          m3, xm12, 1
+    call m(idct_16x8_internal).main2
+    vpbroadcastd         m8, [o(pw_2048)]
+    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    add                  r0, 16
+    add                  r3, 16
+    %define dstq r0
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
+    %define dstq r3
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
+    RET
+
+cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
+    vpbroadcastd         m9, [pw_5]
+    lea                  r4, [strideq*3]
+    sub                eobd, 107 ; loop_iterations = 1 + (eob >= 107)
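+    ; the counter lives in the sign bit: "add eobd, 0x80000000 / jnc" at
+    ; the bottom of the loop flips bit 31 each pass and only carries
+    ; (ending the loop) once the value wraps, giving a second iteration
+    ; exactly when the original eob was >= 107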
+.loop:
+    mova                xm0, [cq+16* 0]
+    mova                xm1, [cq+16* 4]
+    vinserti128          m0, [cq+16* 1], 1
+    vinserti128          m1, [cq+16* 5], 1
+    pxor                 m8, m8
+    mova          [cq+32*0], m8
+    mova          [cq+32*2], m8
+    add                  cq, 16*16
+    mova                xm2, [cq-16* 8]
+    mova                xm3, [cq-16* 4]
+    vinserti128          m2, [cq-16* 7], 1
+    vinserti128          m3, [cq-16* 3], 1
+    mova                xm4, [cq+16* 0]
+    mova                xm5, [cq+16* 4]
+    vinserti128          m4, [cq+16* 1], 1
+    vinserti128          m5, [cq+16* 5], 1
+    mova                xm6, [cq+16* 8]
+    mova                xm7, [cq+16*12]
+    vinserti128          m6, [cq+16* 9], 1
+    vinserti128          m7, [cq+16*13], 1
+    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
+    REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .transpose8x8
+    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
+    add                dstq, strideq
+    sub                  cq, 16*16-32
+    lea                dstq, [dstq+r4*4]
+    add                eobd, 0x80000000
+    jnc .loop
+    RET
+ALIGN function_align
+.transpose8x8:
+    punpckhwd            m8, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m8, m1
+    punpckhdq            m8, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m8
+    punpcklqdq           m6, m8
+    ret
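+    ; classic three-stage 8x8 word transpose: interleave words
+    ; (punpck*wd), then dwords (punpck*dq), then qwords (punpck*qdq);
+    ; after log2(8) = 3 rounds every register holds one column of the
+    ; original block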
+
+cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
+    add                  cq, 16*8
+    vpbroadcastd         m9, [pw_4096]
+    lea                  r4, [strideq*3]
+    lea                  r5, [dstq+strideq*4]
+    sub                eobd, 107
+.loop:
+    mova                xm0, [cq-16*8]
+    mova                xm1, [cq-16*7]
+    vinserti128          m0, [cq+16*0], 1
+    vinserti128          m1, [cq+16*1], 1
+    mova                xm2, [cq-16*6]
+    mova                xm3, [cq-16*5]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m3, [cq+16*3], 1
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm6, [cq-16*2]
+    mova                xm7, [cq-16*1]
+    vinserti128          m6, [cq+16*6], 1
+    vinserti128          m7, [cq+16*7], 1
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
+    %define dstq r5
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
+    add                  cq, 16*16
+    add                  r0, 16
+    add                  r5, 16
+    add                eobd, 0x80000000
+    jnc .loop
+    RET
+
+%define o_base pw_5 + 128
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
+%if %3
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [%1+%2* 0]
+    pmulhrsw             m1, m15, [%1+%2* 1]
+    pmulhrsw             m2, m15, [%1+%2* 2]
+    pmulhrsw             m3, m15, [%1+%2* 3]
+    pmulhrsw             m4, m15, [%1+%2* 4]
+    pmulhrsw             m5, m15, [%1+%2* 5]
+    pmulhrsw             m6, m15, [%1+%2* 6]
+    pmulhrsw             m7, m15, [%1+%2* 7]
+    pmulhrsw             m8, m15, [%1+%2* 8]
+    pmulhrsw             m9, m15, [%1+%2* 9]
+    pmulhrsw            m10, m15, [%1+%2*10]
+    pmulhrsw            m11, m15, [%1+%2*11]
+    pmulhrsw            m12, m15, [%1+%2*12]
+    pmulhrsw            m13, m15, [%1+%2*13]
+    pmulhrsw            m14, m15, [%1+%2*14]
+    pmulhrsw            m15,      [%1+%2*15]
+%else
+    mova                 m0, [%1+%2* 0]
+    mova                 m1, [%1+%2* 1]
+    mova                 m2, [%1+%2* 2]
+    mova                 m3, [%1+%2* 3]
+    mova                 m4, [%1+%2* 4]
+    mova                 m5, [%1+%2* 5]
+    mova                 m6, [%1+%2* 6]
+    mova                 m7, [%1+%2* 7]
+    mova                 m8, [%1+%2* 8]
+    mova                 m9, [%1+%2* 9]
+    mova                m10, [%1+%2*10]
+    mova                m11, [%1+%2*11]
+    mova                m12, [%1+%2*12]
+    mova                m13, [%1+%2*13]
+    mova                m14, [%1+%2*14]
+    mova                m15, [%1+%2*15]
+%endif
+    mova              [rsp], m15
+%if %4
+    pxor                m15, m15
+    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
+                                8,  9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
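+; LOAD_16ROWS fills m0-m15 from %1 with stride %2, spilling m15 to [rsp]
+; since all 16 ymm registers are occupied. is_rect2 pre-scales every row
+; by 2896/4096 (~1/sqrt(2)), the extra normalization used for transforms
+; with a 2:1 side ratio; zero_coefs clears the rows after loading.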
+
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+    mova                m%4, [%2]
+    paddsw              m%3, m%1, m%4
+    psubsw              m%1, m%4
+    pmovzxbw            m%4, [dstq+%6]
+    pmulhrsw            m%3, m%5
+    pmulhrsw            m%1, m%5
+    paddw               m%3, m%4
+    pmovzxbw            m%4, [r2+%7]
+    paddw               m%1, m%4
+    packuswb            m%3, m%1
+    vpermq              m%3, m%3, q3120
+    mova          [dstq+%6], xm%3
+    vextracti128    [r2+%7], m%3, 1
+%endmacro
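+; IDCT32_PASS2_END finishes one register pair: butterfly against the row
+; saved at %2, round via pmulhrsw with the constant in m%5 (pw_2048,
+; i.e. (x+8)>>4), widen the destination pixels with pmovzxbw, add, and
+; pack back with packuswb; vpermq q3120 undoes the lane interleave
+; before the split 16-byte stores to dstq and r2.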
+
+cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jz .dconly
+    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+                                           base, tmp3
+    %undef cmp
+    LOAD_16ROWS          cq, 64, 1
+    call m(idct_16x16_internal).main
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    lea               tmp3q, [tmp1q+32*16]
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_16384)]
+    call .transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova         [tmp3q-32*4+ 0], xm0
+    vextracti128 [tmp3q+32*0+ 0], m0, 1
+    mova         [tmp3q-32*3+ 0], xm2
+    vextracti128 [tmp3q+32*1+ 0], m2, 1
+    mova         [tmp3q-32*2+ 0], xm4
+    vextracti128 [tmp3q+32*2+ 0], m4, 1
+    mova         [tmp3q-32*1+ 0], xm6
+    vextracti128 [tmp3q+32*3+ 0], m6, 1
+    mova         [tmp3q-32*4+16], xm8
+    vextracti128 [tmp3q+32*0+16], m8, 1
+    mova         [tmp3q-32*3+16], xm10
+    vextracti128 [tmp3q+32*1+16], m10, 1
+    mova         [tmp3q-32*2+16], xm12
+    vextracti128 [tmp3q+32*2+16], m12, 1
+    mova         [tmp3q-32*1+16], xm14
+    vextracti128 [tmp3q+32*3+16], m14, 1
+    cmp                eobd, 150
+    jg .full
+    vinserti128          m0, m1, xm9, 1
+    vperm2i128           m4, m1, m9, 0x31
+    vinserti128          m2, m5, xm13, 1
+    vperm2i128           m6, m5, m13, 0x31
+    vinserti128          m1, m3, xm11, 1
+    vperm2i128           m5, m3, m11, 0x31
+    vinserti128          m3, m7, xm15, 1
+    vperm2i128           m7, m7, m15, 0x31
+    call .main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp .idct16
+.dconly:
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 16
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.full:
+    mova       [tmp1q-32*4], m1
+    mova       [tmp1q-32*3], m3
+    mova       [tmp1q-32*2], m5
+    mova       [tmp1q-32*1], m7
+    mova       [tmp1q+32*0], m9
+    mova       [tmp1q+32*1], m11
+    mova       [tmp1q+32*2], m13
+    mova       [tmp1q+32*3], m15
+    LOAD_16ROWS       cq+32, 64, 1
+    call m(idct_16x16_internal).main
+    lea                  r2, [tmp3q+32*8]
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_16384)]
+    call .transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova         [r2-32*4+ 0], xm0
+    vextracti128 [r2+32*0+ 0], m0, 1
+    mova         [r2-32*3+ 0], xm2
+    vextracti128 [r2+32*1+ 0], m2, 1
+    mova         [r2-32*2+ 0], xm4
+    vextracti128 [r2+32*2+ 0], m4, 1
+    mova         [r2-32*1+ 0], xm6
+    vextracti128 [r2+32*3+ 0], m6, 1
+    mova         [r2-32*4+16], xm8
+    vextracti128 [r2+32*0+16], m8, 1
+    mova         [r2-32*3+16], xm10
+    vextracti128 [r2+32*1+16], m10, 1
+    mova         [r2-32*2+16], xm12
+    vextracti128 [r2+32*2+16], m12, 1
+    mova         [r2-32*1+16], xm14
+    vextracti128 [r2+32*3+16], m14, 1
+    vinserti128          m8, m1, xm9, 1
+    vperm2i128          m12, m1, m9, 0x31
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp1q+32*0], 1
+    vinserti128          m1, [tmp1q+32*1], 1
+    vinserti128         m10, m5, xm13, 1
+    vperm2i128          m14, m5, m13, 0x31
+    mova                xm4, [tmp1q-32*4+16]
+    mova                xm5, [tmp1q-32*3+16]
+    vinserti128          m4, [tmp1q+32*0+16], 1
+    vinserti128          m5, [tmp1q+32*1+16], 1
+    vinserti128          m9, m3, xm11, 1
+    vperm2i128          m13, m3, m11, 0x31
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp1q+32*2], 1
+    vinserti128          m3, [tmp1q+32*3], 1
+    vinserti128         m11, m7, xm15, 1
+    vperm2i128          m15, m7, m15, 0x31
+    mova                xm6, [tmp1q-32*2+16]
+    mova                xm7, [tmp1q-32*1+16]
+    vinserti128          m6, [tmp1q+32*2+16], 1
+    vinserti128          m7, [tmp1q+32*3+16], 1
+    call .main_oddhalf
+    LOAD_8ROWS_H    r2-32*4, 32
+.idct16:
+    LOAD_8ROWS   tmp3q-32*4, 32
+    mova              [rsp], m15
+    call m(idct_16x16_internal).main
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    call .pass2_end
+    RET
+ALIGN function_align
+.main_oddhalf_fast: ; lower half is zero
+    mova [rsp+gprsize+32*1], m7
+    pxor                 m7, m7
+    mova [rsp+gprsize+32*0], m7
+    mova [rsp+gprsize+32*2], m7
+    vpbroadcastd        m11, [o(pw_3703x8)]
+    vpbroadcastd         m7, [o(pw_1751x8)]
+    vpbroadcastd        m12, [o(pw_m1380x8)]
+    vpbroadcastd         m8, [o(pw_3857x8)]
+    vpbroadcastd        m13, [o(pw_3973x8)]
+    vpbroadcastd        m15, [o(pw_995x8)]
+    pmulhrsw            m11, m4  ; t29a
+    pmulhrsw             m4, m7  ; t18a
+    pmulhrsw            m12, m3  ; t19a
+    pmulhrsw             m3, m8  ; t28a
+    pmulhrsw            m13, m2  ; t27a
+    pmulhrsw             m2, m15 ; t20a
+    vpbroadcastd        m10, [o(pw_m2106x8)]
+    vpbroadcastd         m7, [o(pw_3513x8)]
+    vpbroadcastd         m9, [o(pw_3290x8)]
+    vpbroadcastd         m8, [o(pw_2440x8)]
+    vpbroadcastd        m14, [o(pw_m601x8)]
+    vpbroadcastd        m15, [o(pw_4052x8)]
+    pmulhrsw            m10, m5  ; t21a
+    pmulhrsw             m5, m7  ; t26a
+    pmulhrsw             m9, m6  ; t25a
+    pmulhrsw             m6, m8  ; t22a
+    pmulhrsw            m14, m1  ; t23a
+    pmulhrsw             m1, m15 ; t24a
+    vpbroadcastd        m15, [o(pd_2048)]
+    jmp .main2
+ALIGN function_align
+.main_oddhalf:
+    mova [rsp+gprsize+32*0], m15
+    mova [rsp+gprsize+32*1], m7
+    mova [rsp+gprsize+32*2], m8
+    vpbroadcastd        m15, [o(pd_2048)]
+    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
+    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
+    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
+    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
+    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
+    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
+.main2:
+    psubsw               m7, m12, m4  ; t18
+    paddsw              m12, m4       ; t19
+    psubsw               m4, m2, m10  ; t21
+    paddsw               m2, m10      ; t20
+    psubsw              m10, m14, m6  ; t22
+    paddsw              m14, m6       ; t23
+    psubsw               m6, m1, m9   ; t25
+    paddsw               m1, m9       ; t24
+    psubsw               m9, m13, m5  ; t26
+    paddsw              m13, m5       ; t27
+    psubsw               m5, m3, m11  ; t29
+    paddsw               m3, m11      ; t28
+    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
+    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
+    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
+    psubsw               m8, m14, m2  ; t20a
+    paddsw              m14, m2       ; t23a
+    psubsw               m2, m1, m13  ; t27a
+    paddsw               m1, m13      ; t24a
+    psubsw              m13, m6, m9   ; t21
+    paddsw               m6, m9       ; t22
+    psubsw               m9, m10, m4  ; t26
+    paddsw              m10, m4       ; t25
+    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
+    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
+    mova                 m4, [rsp+gprsize+32*0] ; in31
+    mova [rsp+gprsize+32*0], m6  ; t22
+    mova                 m6, [rsp+gprsize+32*1] ; in15
+    mova [rsp+gprsize+32*1], m14 ; t23a
+    mova                m14, [rsp+gprsize+32*2] ; in17
+    mova [rsp+gprsize+32*2], m1  ; t24a
+    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
+    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
+    psubsw               m1, m0, m14  ; t17
+    paddsw               m0, m14      ; t16
+    psubsw              m14, m4, m6   ; t30
+    paddsw               m4, m6       ; t31
+    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
+    psubsw               m6, m0, m12  ; t19a
+    paddsw               m0, m12      ; t16a
+    psubsw              m12, m4, m3   ; t28a
+    paddsw               m4, m3       ; t31a
+    psubsw               m3, m14, m5  ; t18
+    paddsw              m14, m5       ; t17
+    psubsw               m5, m1, m7   ; t29
+    paddsw               m1, m7       ; t30
+    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
+    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
+    psubsw               m7, m1, m10  ; t25a
+    paddsw               m1, m10      ; t30a
+    psubsw              m10, m5, m9   ; t21
+    paddsw               m5, m9       ; t18
+    psubsw               m9, m12, m2  ; t20a
+    paddsw              m12, m2       ; t19a
+    psubsw               m2, m3, m13  ; t26
+    paddsw               m3, m13      ; t29
+    psubsw              m13, m6, m8   ; t27a
+    paddsw               m6, m8       ; t28a
+    mova       [tmp1q-32*2], m5
+    mova       [tmp1q-32*1], m12
+    mova       [tmp2q+32*0], m6
+    mova       [tmp2q+32*1], m3
+    mova       [tmp2q+32*2], m1
+    mova                 m5, [rsp+gprsize+32*0] ; t22
+    mova                 m6, [rsp+gprsize+32*1] ; t23
+    mova                 m3, [rsp+gprsize+32*2] ; t24a
+    psubsw               m1, m14, m5  ; t22a
+    paddsw              m14, m5       ; t17a
+    psubsw               m5, m0, m6   ; t23
+    paddsw               m0, m6       ; t16
+    psubsw               m6, m4, m3   ; t24
+    paddsw               m4, m3       ; t31
+    vpbroadcastd         m8, [o(pw_m2896_2896)]
+    vpbroadcastd         m3, [o(pw_2896_2896)]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m14
+    mova       [tmp2q+32*3], m4
+    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
+    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
+    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
+    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
+    mova       [tmp1q+32*0], m13
+    mova       [tmp1q+32*1], m2
+    mova       [tmp1q+32*2], m7
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q-32*4], m5
+    mova       [tmp2q-32*3], m1
+    mova       [tmp2q-32*2], m10
+    mova       [tmp2q-32*1], m9
+    ret
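+    ; the 16 odd-half results (t16..t31, fully combined) are left in
+    ; order across [tmp1q-32*4..tmp1q+32*3] and [tmp2q-32*4..tmp2q+32*3]
+    ; so the pass-2 code can fetch each partner row with a single load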
+ALIGN function_align
+.transpose_2x8x8_round:
+    punpckhwd            m6, m12, m13
+    punpcklwd           m12, m13
+    punpckhwd           m13, m8, m9
+    punpcklwd            m8, m9
+    punpckhwd            m9, m14, m15
+    punpcklwd           m14, m15
+    punpckhwd           m15, m10, m11
+    punpcklwd           m10, m11
+    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
+    punpckhdq           m11, m8, m10
+    punpckldq            m8, m10
+    punpckldq           m10, m12, m14
+    punpckhdq           m12, m14
+    punpckhdq           m14, m13, m15
+    punpckldq           m13, m15
+    punpckldq           m15, m6, m9
+    punpckhdq            m6, m9
+    punpckhqdq           m9, m8, m10
+    punpcklqdq           m8, m10
+    punpcklqdq          m10, m11, m12
+    punpckhqdq          m11, m12
+    punpcklqdq          m12, m13, m15
+    punpckhqdq          m13, m15
+    punpckhqdq          m15, m14, m6
+    punpcklqdq          m14, m6
+    pmulhrsw             m6, m7, [rsp+gprsize+32*0]
+    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
+    pmulhrsw             m7, [rsp+gprsize+32*1]
+    mova [rsp+gprsize+32*0], m15
+    punpckhwd           m15, m4, m5
+    punpcklwd            m4, m5
+    punpckhwd            m5, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m6, m7
+    punpcklwd            m6, m7
+    punpckhwd            m7, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckldq            m2, m4, m6
+    punpckhdq            m4, m6
+    punpckhdq            m6, m5, m7
+    punpckldq            m5, m7
+    punpckldq            m7, m15, m1
+    punpckhdq           m15, m1
+    punpckhqdq           m1, m0, m2
+    punpcklqdq           m0, m2
+    punpcklqdq           m2, m3, m4
+    punpckhqdq           m3, m4
+    punpcklqdq           m4, m5, m7
+    punpckhqdq           m5, m7
+    punpckhqdq           m7, m6, m15
+    punpcklqdq           m6, m15
+    ret
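+    ; same interleave ladder as the 8x32 .transpose8x8, run twice (m8-m15
+    ; first, then m0-m7), with the rounding factor in m7 folded in via
+    ; pmulhrsw on the fly rather than as a separate pass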
+ALIGN function_align
+.pass2_end:
+    mova [rsp+gprsize+32*0], m7
+    mova [rsp+gprsize+32*2], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m1, [rsp+gprsize+32*1]
+    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m7, [rsp+gprsize+32*0]
+    mova                 m1, [rsp+gprsize+32*2]
+    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
+    ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+    mova                m%3, [tmp2q+32*( 3-%1)]
+    psubsw              m%4, m%1, m%3
+    paddsw              m%1, m%3
+    mova                m%3, [tmp1q+32*(11-%2)]
+    mova         [tmp1q+32*(11-%2)+16], xm%4
+    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+    paddsw              m%4, m%2, m%3
+    psubsw              m%2, m%3
+    mova         [tmp1q+32*(11-%2)], xm%2
+    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+    vperm2i128          m%2, m%1, m%4, 0x31
+    vinserti128         m%1, xm%4, 1
+%endmacro
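+; Roughly: IDCT32_PASS1_END butterflies two idct16 output rows (%1, %2)
+; against their saved odd-half partners, writes the difference halves
+; (the out16..out31 side) back to the tmp buffers in 128-bit pieces, and
+; merges the sums across lanes (vperm2i128/vinserti128) so they land in
+; transpose-friendly order.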
+
+cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 16
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [cq+32* 1]
+    pmulhrsw             m1, m15, [cq+32* 3]
+    pmulhrsw             m2, m15, [cq+32* 5]
+    pmulhrsw             m3, m15, [cq+32* 7]
+    pmulhrsw             m4, m15, [cq+32* 9]
+    pmulhrsw             m5, m15, [cq+32*11]
+    pmulhrsw             m6, m15, [cq+32*13]
+    pmulhrsw             m7, m15, [cq+32*15]
+    pmulhrsw             m8, m15, [cq+32*17]
+    pmulhrsw             m9, m15, [cq+32*19]
+    pmulhrsw            m10, m15, [cq+32*21]
+    pmulhrsw            m11, m15, [cq+32*23]
+    pmulhrsw            m12, m15, [cq+32*25]
+    pmulhrsw            m13, m15, [cq+32*27]
+    pmulhrsw            m14, m15, [cq+32*29]
+    pmulhrsw            m15,      [cq+32*31]
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
+    pxor                m15, m15
+    mov                 r3d, 8
+.zero_loop:
+    mova          [cq+32*0], m15
+    mova          [cq+32*1], m15
+    mova          [cq+32*2], m15
+    mova          [cq+32*3], m15
+    add                  cq, 32*4
+    dec                 r3d
+    jg .zero_loop
+    call m(idct_16x16_internal).main
+    call .pass1_end
+    lea                  r2, [strideq*3]
+    mov                  r3, dstq
+.pass2:
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    call m(idct_16x16_internal).main
+    mova         [rsp+32*2], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX  {pmulhrsw x, m15}, m2, m3, m0
+    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
+    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
+    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX  {pmulhrsw x, m15}, m11, m12, m13, m14
+    pmulhrsw            m15, [rsp+32*2]
+    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
+    test                 r3, r3
+    jnz .right_half
+    RET
+.right_half:
+    LOAD_8ROWS   tmp1q-32*4, 32
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    lea                dstq, [r3+16]
+    xor                 r3d, r3d
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    jmp .pass2
+ALIGN function_align
+.pass1_end:
+    mova [rsp+gprsize+32*0], m9
+    IDCT32_PASS1_END      0,  8,  1,  9
+    IDCT32_PASS1_END      2, 10,  1,  9
+    IDCT32_PASS1_END      3, 11,  1,  9
+    IDCT32_PASS1_END      4, 12,  1,  9
+    IDCT32_PASS1_END      5, 13,  1,  9
+    IDCT32_PASS1_END      6, 14,  1,  9
+    IDCT32_PASS1_END      7, 15,  1,  9
+    mova                 m1, [rsp+gprsize+32*1]
+    mova                 m9, [rsp+gprsize+32*0]
+    mova [rsp+gprsize+32*0], m6
+    mova [rsp+gprsize+32*1], m7
+    IDCT32_PASS1_END      1,  9,  6,  7
+    ret
+
+cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
+    %undef cmp
+    lea                 rax, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_1697x16)]
+    vpbroadcastd        m12, [o(pw_8192)]
+    cmp                eobd, 43   ; if (eob > 43)
+    setg                r4b       ;   iteration_count++
+    cmp                eobd, 150  ; if (eob > 150)
+    setg                 al       ;   iteration_count++
+    add                eobd, -279 ; if (eob > 278)
+    adc                 r4b, al   ;   iteration_count++
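+    ; branchless count: r4b = (eob > 43) + (eob > 150) + (eob > 278),
+    ; since "add eobd, -279" sets carry exactly when eob >= 279 and adc
+    ; folds it in; the loop below then runs r4b+1 times (dec/jge)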
+    lea                  r3, [strideq*3]
+    mov                 rax, cq
+    paddw               m11, m12, m12 ; pw_16384
+.loop:
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
+    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jge .loop
+    sub                  cq, 32
+    pxor                 m0, m0
+    mov                 r0d, 8
+    cmp                  cq, rax
+    ja .zero_loop
+.zero_loop_half:
+    mova         [rax+64*0], m0
+    mova         [rax+64*1], m0
+    add                 rax, 64*4
+    mova         [rax-64*2], m0
+    mova         [rax-64*1], m0
+    sub                 r0d, 2
+    jg .zero_loop_half
+    RET
+.zero_loop:
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    mova         [rax+32*3], m0
+    add                 rax, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
+    %undef cmp
+    lea                 rax, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_1697x16)]
+    vpbroadcastd        m11, [o(pw_2048)]
+    cmp                eobd, 35  ; if (eob > 35)
+    setg                r4b      ;   iteration_count++
+    cmp                eobd, 150 ; if (eob > 150)
+    setg                r3b      ;   iteration_count += 2
+    lea                 r4d, [r4+r3*2]
+    lea                  r3, [strideq*3]
+    mov                  r5, dstq
+    mov                 rax, cq
+.loop:
+    mova                xm0, [cq+32* 0]
+    mova                xm1, [cq+32* 1]
+    vinserti128          m0, [cq+32* 8], 1
+    vinserti128          m1, [cq+32* 9], 1
+    mova                xm2, [cq+32* 2]
+    mova                xm3, [cq+32* 3]
+    vinserti128          m2, [cq+32*10], 1
+    vinserti128          m3, [cq+32*11], 1
+    mova                xm4, [cq+32* 4]
+    mova                xm5, [cq+32* 5]
+    vinserti128          m4, [cq+32*12], 1
+    vinserti128          m5, [cq+32*13], 1
+    mova                xm6, [cq+32* 6]
+    mova                xm7, [cq+32* 7]
+    vinserti128          m6, [cq+32*14], 1
+    vinserti128          m7, [cq+32*15], 1
+    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX  {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
+    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jl .ret
+    test                r4b, 1
+    jz .loop
+    add                  cq, 32*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    sub                  cd, eax
+    pxor                 m0, m0
+    add                  cd, 384
+.zero_loop:
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    mova         [rax+32*3], m0
+    add                 rax, 32*4
+    sub                  cd, 128
+    jge .zero_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+                                           base, tmp3, tmp4
+    %undef cmp
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    sub                eobd, 136
+    mov               tmp4d, eobd
+.pass1_loop:
+    LOAD_8ROWS      cq+64*1, 64*2
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test              tmp4d, tmp4d
+    jl .fast
+    LOAD_8ROWS_H   cq+64*17, 64*2
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_8ROWS_H   cq+64*16, 64*2
+    pxor                 m0, m0
+    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                               24, 25, 26, 27, 28, 29, 30, 31
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS      cq+64*0, 64*2
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_32x16).pass1_end
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    lea               tmp3q, [tmp1q+32*32]
+    mova                m15, [rsp]
+    mova       [tmp3q-32*4], m0
+    mova       [tmp3q-32*3], m2
+    mova       [tmp3q-32*2], m4
+    mova       [tmp3q-32*1], m6
+    mova       [tmp3q+32*0], m8
+    mova       [tmp3q+32*1], m10
+    mova       [tmp3q+32*2], m12
+    mova       [tmp3q+32*3], m14
+    add               tmp3q, 32*8
+    mova       [tmp3q-32*4], m1
+    mova       [tmp3q-32*3], m3
+    mova       [tmp3q-32*2], m5
+    mova       [tmp3q-32*1], m7
+    mova       [tmp3q+32*0], m9
+    mova       [tmp3q+32*1], m11
+    mova       [tmp3q+32*2], m13
+    mova       [tmp3q+32*3], m15
+    vpbroadcastd         m9, [o(pw_8192)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add                eobd, 0x80000000
+    jnc .pass1_loop
+    add               tmp1q, 32*24
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    test              tmp4d, tmp4d
+    jge .pass2_loop
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add               tmp3q, 32*16
+.pass2_loop:
+    LOAD_8ROWS   tmp2q-32*4, 32
+    test              tmp4d, tmp4d
+    jl .fast2
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    sub               tmp3q, 32*8
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    sub               tmp3q, 32*16
+    jmp .pass2_loop_end
+.fast2:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    sub               tmp3q, 32*24
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+    LOAD_8ROWS   tmp3q-32*4, 32
+    mova              [rsp], m15
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_16x32).pass2_end
+    lea               tmp3q, [tmp1q-32*32]
+    cmp               tmp2q, tmp3q
+    jb .ret
+    sub               tmp2q, 32*32
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob
+    %undef cmp
+    vpbroadcastd         m9, [pw_8192]
+    sub                eobd, 136 ; if (eob < 136)
+    shr                eobd, 30  ;     topleft 16x16 only
+    lea                eobd, [eobq*2-8]
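+    ; eobd becomes -8 (all 8 loop passes) or, when eob < 136 and only the
+    ; top-left 16x16 can contain coefficients, 3*2-8 = -2 (2 passes);
+    ; inc/jz terminates at zero and "test eobd, 3" switches to the right
+    ; 16-column half once every 4 passes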
+    lea                  r4, [strideq*3]
+    mov                  r5, dstq
+    lea                 rax, [cq+32]
+.loop:
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
+    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    inc                eobd
+    jz .ret
+    test               eobd, 3
+    jnz .loop
+    add                  cq, 64*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    pxor                 m0, m0
+    mov                 r0d, 16
+    cmp                  cq, rax
+    jne .zero_loop
+.zero_loop_topleft:
+    mova         [rax-32*1], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*3], m0
+    mova         [rax+32*5], m0
+    add                 rax, 64*4
+    sub                 r0d, 4
+    jg .zero_loop_topleft
+    RET
+.zero_loop:
+    mova         [rax-32*1], m0
+    mova         [rax+32*0], m0
+    mova         [rax+32*1], m0
+    mova         [rax+32*2], m0
+    add                 rax, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+    mova                m%5, [tmp1q-32*(45-%1)]
+    mova                m%4, [tmp2q-32*(20+%1)]
+%endif
+    psubsw              m%6, m%5, m%4 ; idct32 out31-n
+    paddsw              m%5, m%4      ; idct32 out 0+n
+    psubsw              m%4, m%6, m%3 ; out32+n
+    paddsw              m%6, m%3      ; out31-n
+    psubsw              m%3, m%5, m%2 ; out63-n
+    paddsw              m%5, m%2      ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+    mova [tmp2q-32*(19-%1)], m%4
+    mova [tmp1q-32*(14+%1)], m%6
+    mova [tmp1q+32*(18-%1)], m%3
+    mova [tmp2q-32*(51-%1)], m%5
+%else
+    mova [tmp1q-32*(13-%1)], m%4
+    mova [tmp2q-32*(20+%1)], m%6
+    mova [tmp2q+32*(12-%1)], m%3
+    mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+    %define %%d0 r2
+    %define %%d1 dstq
+%else
+    %define %%d0 dstq
+    %define %%d1 r2
+%endif
+    pmovzxbw            m%2, [%%d0+%9 ]
+    paddw               m%2, m%4
+    pmovzxbw            m%4, [%%d1+%8 ]
+    paddw               m%4, m%6
+    pmovzxbw            m%6, [%%d1+%10]
+    paddw               m%3, m%6
+    pmovzxbw            m%6, [%%d0+%7 ]
+    paddw               m%5, m%6
+    packuswb            m%2, m%4
+    packuswb            m%3, m%5
+    vpermq              m%2, m%2, q3120
+    vpermq              m%3, m%3, q3120
+    mova         [%%d0+%9 ], xm%2
+    vextracti128 [%%d1+%8 ], m%2, 1
+    mova         [%%d1+%10], xm%3
+    vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
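+; IDCT64_PART2_END closes the idct64: it first recombines an idct16
+; output with an idct32 tail value into the idct32 pair (out 0+n and
+; out31-n), then sumsubs that against the idct64 tail in %2/%3 to yield
+; out n, out31-n, out32+n and out63-n. With 6 args (pass 1) the results
+; are spilled to the tmp buffers; in pass 2 they are rounded with m14
+; and accumulated into the destination rows, as in IDCT32_PASS2_END.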
+
+cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.normal:
+    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    %undef cmp
+    lea               tmp1q, [rsp+32*23]
+    lea               tmp2q, [tmp1q+32*24]
+    sub                eobd, 151
+    mov                 r7d, eobd
+.pass1_loop:
+    LOAD_16ROWS          cq, 64
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m2
+    mova       [tmp1q-32*2], m4
+    mova       [tmp1q-32*1], m6
+    mova       [tmp1q+32*0], m8
+    mova       [tmp1q+32*1], m10
+    mova       [tmp1q+32*2], m12
+    mova       [tmp1q+32*3], m14
+    mova       [tmp2q-32*4], m1
+    mova       [tmp2q-32*3], m3
+    mova       [tmp2q-32*2], m5
+    mova       [tmp2q-32*1], m7
+    mova       [tmp2q+32*0], m9
+    mova       [tmp2q+32*1], m11
+    mova       [tmp2q+32*2], m13
+    mova       [tmp2q+32*3], m15
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    add                eobd, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*23]
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm1, [r2-32*2+ 0]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m1, [r2+32*2+ 0], 1
+    mova                xm2, [r2-32*4+16]
+    mova                xm3, [r2-32*2+16]
+    vinserti128          m2, [r2+32*0+16], 1
+    vinserti128          m3, [r2+32*2+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test                r7d, r7d
+    jl .fast
+    lea                  r3, [r2+32*8]
+    mova                xm4, [r3-32*4+ 0]
+    mova                xm5, [r3-32*2+ 0]
+    vinserti128          m4, [r3+32*0+ 0], 1
+    vinserti128          m5, [r3+32*2+ 0], 1
+    mova                xm6, [r3-32*4+16]
+    mova                xm7, [r3-32*2+16]
+    vinserti128          m6, [r3+32*0+16], 1
+    vinserti128          m7, [r3+32*2+16], 1
+.fast:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                xm0, [r2-32*3+ 0]
+    mova                xm1, [r2-32*1+ 0]
+    vinserti128          m0, [r2+32*1+ 0], 1
+    vinserti128          m1, [r2+32*3+ 0], 1
+    mova                xm2, [r2-32*3+16]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m2, [r2+32*1+16], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test                r7d, r7d
+    jl .fast2
+    mova                xm4, [r3-32*3+ 0]
+    mova                xm5, [r3-32*1+ 0]
+    vinserti128          m4, [r3+32*1+ 0], 1
+    vinserti128          m5, [r3+32*3+ 0], 1
+    mova                xm6, [r3-32*3+16]
+    mova                xm7, [r3-32*1+16]
+    vinserti128          m6, [r3+32*1+16], 1
+    vinserti128          m7, [r3+32*3+16], 1
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    add                  r2, 32*24
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    mova                xm4, [r2-32*4+16]
+    mova                xm7, [r2-32*1+ 0]
+    vinserti128          m4, [r2+32*0+16], 1
+    vinserti128          m7, [r2+32*3+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast3
+    add                  r3, 32*24
+    mova                xm1, [r3-32*1+16]
+    mova                xm2, [r3-32*4+ 0]
+    vinserti128          m1, [r3+32*3+16], 1
+    vinserti128          m2, [r3+32*0+ 0], 1
+    mova                xm5, [r3-32*1+ 0]
+    mova                xm6, [r3-32*4+16]
+    vinserti128          m5, [r3+32*3+ 0], 1
+    vinserti128          m6, [r3+32*0+16], 1
+.fast3:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                xm0, [r2-32*2+ 0]
+    mova                xm3, [r2-32*3+16]
+    vinserti128          m0, [r2+32*2+ 0], 1
+    vinserti128          m3, [r2+32*1+16], 1
+    mova                xm4, [r2-32*2+16]
+    mova                xm7, [r2-32*3+ 0]
+    vinserti128          m4, [r2+32*2+16], 1
+    vinserti128          m7, [r2+32*1+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast4
+    mova                xm1, [r3-32*3+16]
+    mova                xm2, [r3-32*2+ 0]
+    vinserti128          m1, [r3+32*1+16], 1
+    vinserti128          m2, [r3+32*2+ 0], 1
+    mova                xm5, [r3-32*3+ 0]
+    mova                xm6, [r3-32*2+16]
+    vinserti128          m5, [r3+32*1+ 0], 1
+    vinserti128          m6, [r3+32*2+16], 1
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    RET
+ALIGN function_align
+%define o_base idct64_mul - 8
+.main_part1:
+    ; idct64 steps 1-5:
+    ; in1/31/17/15/ 9/23/25/ 7 ->
+    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+    ; in5/27/21/11/13/19/29/ 3 ->
+    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
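+    ; each idct64_mul entry is a 16-bit constant stored doubled in a
+    ; dword; consecutive pairs hold the two multipliers of one stage-1
+    ; rotation (e.g. entries 0/1 produce t63a/t32a from in1), applied
+    ; via vpbroadcastd + pmulhrsw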
+    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
+    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
+    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
+    pmulhrsw            m11, m0  ; t63a
+    pmulhrsw             m0, m13 ; t32a
+    pmulhrsw            m10, m1  ; t62a
+    pmulhrsw             m1, m12 ; t33a
+    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
+    vpbroadcastd         m8, [o(idct64_mul+4*12)]
+    vpbroadcastd        m12, [o(idct64_mul+4*13)]
+    pmulhrsw             m9, m2  ; t61a
+    pmulhrsw             m2, m13 ; t34a
+    pmulhrsw             m8, m3  ; t60a
+    pmulhrsw             m3, m12 ; t35a
+    psubsw              m12, m0, m1   ; t33
+    paddsw               m0, m1       ; t32
+    psubsw               m1, m3, m2   ; t34
+    paddsw               m3, m2       ; t35
+    psubsw               m2, m8, m9   ; t61
+    paddsw               m8, m9       ; t60
+    psubsw               m9, m11, m10 ; t62
+    paddsw              m11, m10      ; t63
+    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
+    vpbroadcastd        m14, [o(pw_401_4076)]
+    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+    psubsw              m10, m0, m3  ; t35a
+    paddsw               m0, m3      ; t32a
+    psubsw               m3, m11, m8 ; t60a
+    paddsw              m11, m8      ; t63a
+    psubsw               m8, m9, m2  ; t34
+    paddsw               m9, m2      ; t33
+    psubsw               m2, m12, m1 ; t61
+    paddsw              m12, m1      ; t62
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m9
+    mova       [tmp2q+32*2], m12
+    mova       [tmp2q+32*3], m11
+    vpbroadcastd        m13, [o(pw_m4017_799)]
+    vpbroadcastd        m14, [o(pw_799_4017)]
+    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
+    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp2q+32*0], m10
+    mova       [tmp2q+32*1], m8
+    vpbroadcastd         m3, [o(idct64_mul+4*16)]
+    vpbroadcastd        m11, [o(idct64_mul+4*17)]
+    vpbroadcastd         m2, [o(idct64_mul+4*20)]
+    vpbroadcastd        m10, [o(idct64_mul+4*21)]
+    vpbroadcastd         m1, [o(idct64_mul+4*24)]
+    vpbroadcastd         m9, [o(idct64_mul+4*25)]
+    vpbroadcastd         m0, [o(idct64_mul+4*28)]
+    vpbroadcastd         m8, [o(idct64_mul+4*29)]
+    pmulhrsw             m3, m4  ; t59a
+    pmulhrsw             m4, m11 ; t36a
+    pmulhrsw             m2, m5  ; t58a
+    pmulhrsw             m5, m10 ; t37a
+    pmulhrsw             m1, m6  ; t57a
+    pmulhrsw             m6, m9  ; t38a
+    pmulhrsw             m0, m7  ; t56a
+    pmulhrsw             m7, m8  ; t39a
+    psubsw               m8, m4, m5 ; t37
+    paddsw               m4, m5     ; t36
+    psubsw               m5, m7, m6 ; t38
+    paddsw               m7, m6     ; t39
+    psubsw               m6, m0, m1 ; t57
+    paddsw               m0, m1     ; t56
+    psubsw               m1, m3, m2 ; t58
+    paddsw               m3, m2     ; t59
+    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
+    vpbroadcastd        m10, [o(pw_3166_2598)]
+    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
+    psubsw               m2, m7, m4 ; t36a
+    paddsw               m7, m4     ; t39a
+    psubsw               m4, m0, m3 ; t59a
+    paddsw               m0, m3     ; t56a
+    psubsw               m3, m6, m1 ; t37
+    paddsw               m6, m1     ; t38
+    psubsw               m1, m5, m8 ; t58
+    paddsw               m5, m8     ; t57
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    mova       [tmp2q-32*4], m0
+    mova       [tmp2q-32*3], m5
+    vpbroadcastd         m6, [o(pw_m799_m4017)]
+    vpbroadcastd         m7, [o(pw_m4017_799)]
+    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
+    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m1
+    mova       [tmp2q-32*2], m3
+    mova       [tmp2q-32*1], m2
+    ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+    sub                 rax, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
+.main_part2_pass1_loop:
+    call .main_part2_internal
+    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
+    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
+    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
+    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass1_loop
+    ret
+.main_part2_internal:
+    mova                 m0, [tmp1q-32*12] ; t32a
+    mova                 m6, [tmp2q-32*13] ; t39a
+    mova                 m1, [tmp1q-32* 4] ; t40a
+    mova                 m5, [tmp2q+32* 3] ; t55a
+    add               tmp1q, 32
+    sub               tmp2q, 32
+    mova                 m2, [tmp1q+32* 3] ; t48a
+    mova                 m4, [tmp2q-32* 4] ; t47a
+    mova                 m3, [tmp1q+32*11] ; t56a
+    mova                 m7, [tmp2q+32*12] ; t63a
+    psubsw               m8, m0, m6 ; t39
+    paddsw               m0, m6     ; t32
+    psubsw               m6, m4, m1 ; t40
+    paddsw               m4, m1     ; t47
+    psubsw               m1, m2, m5 ; t55
+    paddsw               m2, m5     ; t48
+    psubsw               m5, m7, m3 ; t56
+    paddsw               m7, m3     ; t63
+    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
+    vpbroadcastd         m9, [o(pw_m1567_m3784)]
+    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
+    psubsw               m3, m0, m4 ; t47a
+    paddsw               m0, m4     ; t32a
+    psubsw               m4, m7, m2 ; t48a
+    paddsw               m7, m2     ; t63a
+    psubsw               m2, m5, m1 ; t40
+    paddsw               m5, m1     ; t39
+    psubsw               m1, m8, m6 ; t55
+    paddsw               m8, m6     ; t56
+    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
+    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
+    ret
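+    ; pass 2 runs the same final butterflies, but IDCT64_PART2_END adds the
+    ; results straight into the destination; the stride*5..8 multiples set up
+    ; below select the output rows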
+.main_part2_pass2:
+    sub                 rax, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    lea                  r9, [strideq*5]    ; stride*5
+    lea                  r3, [r9+strideq*1] ; stride*6
+    lea                  r7, [r9+strideq*2] ; stride*7
+    lea                  r8, [r3+strideq*2] ; stride*8
+    lea                  r2, [dstq+r7]
+.main_part2_pass2_loop:
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
+    call .main_part2_internal
+    vpbroadcastd        m14, [o(pw_2048)]
+    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
+    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
+    add                dstq, strideq
+    sub                  r2, strideq
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass2_loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 16
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [o(pw_2048)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m1, m1
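+    ; add the rounded DC value to all 64 pixels of each row: widen bytes to
+    ; words, add, then repack with unsigned saturation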
+.dconly_loop:
+    mova                 m2, [dstq+32*0]
+    mova                 m3, [dstq+32*1]
+    punpckhbw            m4, m2, m1
+    punpcklbw            m2, m1
+    punpckhbw            m5, m3, m1
+    punpcklbw            m3, m1
+    paddw                m4, m0
+    paddw                m2, m0
+    paddw                m5, m0
+    paddw                m3, m0
+    packuswb             m2, m4
+    packuswb             m3, m5
+    mova        [dstq+32*0], m2
+    mova        [dstq+32*1], m3
+    add                dstq, strideq
+    dec                 r2d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    LOAD_8ROWS      cq+32*0, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+32*2, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+32* 1]
+    mova                 m1, [cq+32*31]
+    mova                 m2, [cq+32*17]
+    mova                 m3, [cq+32*15]
+    mova                 m4, [cq+32* 9]
+    mova                 m5, [cq+32*23]
+    mova                 m6, [cq+32*25]
+    mova                 m7, [cq+32* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+32* 5]
+    mova                 m1, [cq+32*27]
+    mova                 m2, [cq+32*21]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*13]
+    mova                 m5, [cq+32*19]
+    mova                 m6, [cq+32*29]
+    mova                 m7, [cq+32* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*36
+    lea                  r2, [strideq*3]
+    mov               tmp2d, 4
+.pass2_loop:
+    lea                  r3, [tmp1q-32*8]
+    mova                xm0, [r3   -32*4]
+    mova                xm1, [r3   -32*3]
+    vinserti128          m0, [tmp1q-32*4], 1
+    vinserti128          m1, [tmp1q-32*3], 1
+    mova                xm2, [r3   -32*2]
+    mova                xm3, [r3   -32*1]
+    vinserti128          m2, [tmp1q-32*2], 1
+    vinserti128          m3, [tmp1q-32*1], 1
+    mova                xm4, [r3   +32*0]
+    mova                xm5, [r3   +32*1]
+    vinserti128          m4, [tmp1q+32*0], 1
+    vinserti128          m5, [tmp1q+32*1], 1
+    mova                xm6, [r3   +32*2]
+    mova                xm7, [r3   +32*3]
+    vinserti128          m6, [tmp1q+32*2], 1
+    vinserti128          m7, [tmp1q+32*3], 1
+    mova                xm8, [r3   -32*4+16]
+    mova                xm9, [r3   -32*3+16]
+    vinserti128          m8, [tmp1q-32*4+16], 1
+    vinserti128          m9, [tmp1q-32*3+16], 1
+    mova               xm10, [r3   -32*2+16]
+    mova               xm11, [r3   -32*1+16]
+    vinserti128         m10, [tmp1q-32*2+16], 1
+    vinserti128         m11, [tmp1q-32*1+16], 1
+    mova               xm12, [r3   +32*0+16]
+    mova               xm13, [r3   +32*1+16]
+    vinserti128         m12, [tmp1q+32*0+16], 1
+    vinserti128         m13, [tmp1q+32*1+16], 1
+    mova               xm14, [r3   +32*2+16]
+    mova               xm15, [r3   +32*3+16]
+    vinserti128         m14, [tmp1q+32*2+16], 1
+    vinserti128         m15, [tmp1q+32*3+16], 1
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    call m(idct_16x16_internal).main
+    mova         [rsp+32*0], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
+    lea                  r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
+    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
+    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
+    pmulhrsw            m15, [rsp+32*0]
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
+    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
+    add               tmp1q, 32*16
+    add                  r0, 16
+    dec               tmp2d
+    jg .pass2_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 64
+    jmp m(inv_txfm_add_dct_dct_32x8).dconly
+.normal:
+    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*7]
+    lea                r10d, [eobq-136]
+    sar                r10d, 31
+.pass1_loop:
+    lea               tmp2q, [tmp1q+32*16]
+    LOAD_8ROWS      cq+64*1, 64*2, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test               r10b, r10b
+    jnz .fast
+    LOAD_8ROWS_H   cq+64*17, 64*2, 2
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    LOAD_8ROWS_H   cq+64*16, 64*2, 1
+    mova              [rsp], m15
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                24, 25, 26, 27, 28, 29, 30, 31
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS      cq+64*0, 64*2, 1
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_32x16).pass1_end
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
+    lea                  r3, [tmp1q+32*48]
+    mova                m15, [rsp]
+    mova          [r3-32*4], m0
+    mova          [r3-32*3], m2
+    mova          [r3-32*2], m4
+    mova          [r3-32*1], m6
+    mova          [r3+32*0], m8
+    mova          [r3+32*1], m10
+    mova          [r3+32*2], m12
+    mova          [r3+32*3], m14
+    add                  r3, 32*24
+    mova          [r3-32*4], m1
+    mova          [r3-32*3], m3
+    mova          [r3-32*2], m5
+    mova          [r3-32*1], m7
+    mova          [r3+32*0], m9
+    mova          [r3+32*1], m11
+    mova          [r3+32*2], m13
+    mova          [r3+32*3], m15
+    vpbroadcastd         m9, [o(pw_16384)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*55]
+    lea                  r7, [r2+32*24]
+.pass2_loop:
+    lea                  r3, [r2+32*8]
+    lea                  r8, [r7+32*8]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test               r10b, r10b
+    jnz .fast2
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast2:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*39]
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10b, r10b
+    jnz .fast3
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast3:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r7-32*4]
+    mova                 m3, [r7+32*3]
+    mova                 m4, [r7+32*0]
+    mova                 m7, [r7-32*1]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast4
+    mova                 m1, [r8+32*3]
+    mova                 m2, [r8-32*4]
+    mova                 m5, [r8-32*1]
+    mova                 m6, [r8+32*0]
+.fast4:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r7-32*2]
+    mova                 m3, [r7+32*1]
+    mova                 m4, [r7+32*2]
+    mova                 m7, [r7-32*3]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast5
+    mova                 m1, [r8+32*1]
+    mova                 m2, [r8-32*2]
+    mova                 m5, [r8-32*3]
+    mova                 m6, [r8+32*2]
+.fast5:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    add                r10d, 0x80000000
+    jc .ret
+    lea                  r2, [rsp+32*7]
+    lea                  r7, [r2+32*16]
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    mov                 r2d, 32
+    jmp m(inv_txfm_add_dct_dct_64x16).dconly
+.normal:
+    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+                                            base, tmp3, tmp4
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp4d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS      cq+64*0, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+64*2, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    vpbroadcastd         m7, [o(pw_2896x8)]
+    pmulhrsw             m0, m7, [cq+64* 1]
+    pmulhrsw             m1, m7, [cq+64*31]
+    pmulhrsw             m2, m7, [cq+64*17]
+    pmulhrsw             m3, m7, [cq+64*15]
+    pmulhrsw             m4, m7, [cq+64* 9]
+    pmulhrsw             m5, m7, [cq+64*23]
+    pmulhrsw             m6, m7, [cq+64*25]
+    pmulhrsw             m7,     [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    pmulhrsw             m0, m7, [cq+64* 5]
+    pmulhrsw             m1, m7, [cq+64*27]
+    pmulhrsw             m2, m7, [cq+64*21]
+    pmulhrsw             m3, m7, [cq+64*11]
+    pmulhrsw             m4, m7, [cq+64*13]
+    pmulhrsw             m5, m7, [cq+64*19]
+    pmulhrsw             m6, m7, [cq+64*29]
+    pmulhrsw             m7,     [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
+    add                  cq, 32
+    add               tmp4d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*15]
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    mov               tmp4b, 4
+.pass2_loop:
+    lea               tmp2q, [tmp1q+32*64]
+    LOAD_8ROWS   tmp1q-32*4, 32
+    test              tmp4d, 0x40000000
+    jnz .fast
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
+    lea               tmp3q, [tmp2q-32*8]
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    lea               tmp3q, [tmp1q-32*8]
+    LOAD_8ROWS   tmp3q-32*4, 32
+    call m(idct_16x16_internal).main
+    call m(inv_txfm_add_dct_dct_16x32).pass2_end
+    add               tmp1q, 32*16
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    dec               tmp4b
+    jg .pass2_loop
+    RET
+ALIGN function_align
+.transpose_round_interleave:
+    mov               tmp3d, 4
+.loop:
+    lea               tmp2q, [tmp1q+32*8]
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp2q-32*4], 1
+    vinserti128          m1, [tmp2q-32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp2q-32*2], 1
+    vinserti128          m3, [tmp2q-32*1], 1
+    mova                xm4, [tmp1q+32*0]
+    mova                xm5, [tmp1q+32*1]
+    vinserti128          m4, [tmp2q+32*0], 1
+    vinserti128          m5, [tmp2q+32*1], 1
+    mova                xm6, [tmp1q+32*2]
+    mova                xm7, [tmp1q+32*3]
+    vinserti128          m6, [tmp2q+32*2], 1
+    vinserti128          m7, [tmp2q+32*3], 1
+    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova                xm8, [tmp1q-32*4+16]
+    mova                xm9, [tmp1q-32*3+16]
+    vinserti128          m8, [tmp2q-32*4+16], 1
+    vinserti128          m9, [tmp2q-32*3+16], 1
+    mova       [tmp1q-32*4], m0
+    mova       [tmp2q-32*4], m1
+    mova       [tmp1q-32*3], m2
+    mova       [tmp2q-32*3], m3
+    mova                xm2, [tmp1q-32*2+16]
+    mova                xm3, [tmp1q-32*1+16]
+    vinserti128          m2, [tmp2q-32*2+16], 1
+    vinserti128          m3, [tmp2q-32*1+16], 1
+    mova       [tmp1q-32*2], m4
+    mova       [tmp2q-32*2], m5
+    mova       [tmp1q-32*1], m6
+    mova       [tmp2q-32*1], m7
+    mova                xm4, [tmp1q+32*0+16]
+    mova                xm5, [tmp1q+32*1+16]
+    vinserti128          m4, [tmp2q+32*0+16], 1
+    vinserti128          m5, [tmp2q+32*1+16], 1
+    mova                xm6, [tmp1q+32*2+16]
+    mova                xm7, [tmp1q+32*3+16]
+    vinserti128          m6, [tmp2q+32*2+16], 1
+    vinserti128          m7, [tmp2q+32*3+16], 1
+    pmulhrsw             m0, m8, m10
+    pmulhrsw             m1, m9, m10
+    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add               tmp1q, 32*16
+    dec               tmp3d
+    jg .loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob
+    lea                 rax, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    mov                 r2d, 64
+    jmp m(inv_txfm_add_dct_dct_64x16).dconly
+.normal:
+    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*71]
+    lea                r10d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS      cq+64*0, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS      cq+64*2, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+64* 1]
+    mova                 m1, [cq+64*31]
+    mova                 m2, [cq+64*17]
+    mova                 m3, [cq+64*15]
+    mova                 m4, [cq+64* 9]
+    mova                 m5, [cq+64*23]
+    mova                 m6, [cq+64*25]
+    mova                 m7, [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+64* 5]
+    mova                 m1, [cq+64*27]
+    mova                 m2, [cq+64*21]
+    mova                 m3, [cq+64*11]
+    mova                 m4, [cq+64*13]
+    mova                 m5, [cq+64*19]
+    mova                 m6, [cq+64*29]
+    mova                 m7, [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
+    add                  cq, 32
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*7]
+    mov                r10b, 4
+.pass2_loop:
+    lea                  r2, [tmp1q+32*64]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m4
+    test               r10d, 0x40000000
+    jnz .fast
+    lea                  r3, [r2+32*64]
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast:
+    call m(idct_16x16_internal).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10d, 0x40000000
+    jnz .fast2
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add                  r2, 32*8
+    add                  r3, 32*8
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r2-32*4] ;  1
+    mova                 m3, [r2+32*3] ; 15
+    mova                 m4, [r2+32*0] ;  9
+    mova                 m7, [r2-32*1] ;  7
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast3
+    mova                 m1, [r3+32*3] ; 31
+    mova                 m2, [r3-32*4] ; 17
+    mova                 m5, [r3-32*1] ; 23
+    mova                 m6, [r3+32*0] ; 25
+.fast3:
+    add                 rax, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    add                 rax, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r2-32*2] ;  5
+    mova                 m3, [r2+32*1] ; 11
+    mova                 m4, [r2+32*2] ; 13
+    mova                 m7, [r2-32*3] ;  3
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast4
+    mova                 m1, [r3+32*1] ; 27
+    mova                 m2, [r3-32*2] ; 21
+    mova                 m5, [r3-32*3] ; 19
+    mova                 m6, [r3+32*2] ; 29
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64).main_part1
+    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
+    sub               tmp1q, 32*28
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    dec                r10b
+    jg .pass2_loop
+    RET
+
+%endif ; ARCH_X86_64
diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx_init_tmpl.c
new file mode 100644
index 0000000..7d0c58c
--- /dev/null
@@ -0,0 +1,187 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
+decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
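+
+/* As an illustration, decl_itx2_fns(8, 32, avx2) declares
+ * dav1d_inv_txfm_add_dct_dct_8x32_avx2 and
+ * dav1d_inv_txfm_add_identity_identity_8x32_avx2; the 12-, 16- and 17-entry
+ * variants stack the remaining transform-type combinations on top. */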
+
+decl_itx17_fns( 4,  4, avx2);
+decl_itx16_fns( 4,  8, avx2);
+decl_itx16_fns( 4, 16, avx2);
+decl_itx16_fns( 8,  4, avx2);
+decl_itx16_fns( 8,  8, avx2);
+decl_itx16_fns( 8, 16, avx2);
+decl_itx2_fns ( 8, 32, avx2);
+decl_itx16_fns(16,  4, avx2);
+decl_itx16_fns(16,  8, avx2);
+decl_itx12_fns(16, 16, avx2);
+decl_itx2_fns (16, 32, avx2);
+decl_itx2_fns (32,  8, avx2);
+decl_itx2_fns (32, 16, avx2);
+decl_itx2_fns (32, 32, avx2);
+
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
+
+decl_itx17_fns( 4,  4, ssse3);
+decl_itx16_fns( 4,  8, ssse3);
+decl_itx16_fns( 8,  4, ssse3);
+decl_itx16_fns( 8,  8, ssse3);
+decl_itx16_fns( 4, 16, ssse3);
+decl_itx16_fns(16,  4, ssse3);
+decl_itx16_fns( 8, 16, ssse3);
+decl_itx16_fns(16,  8, ssse3);
+decl_itx12_fns(16, 16, ssse3);
+decl_itx2_fns ( 8, 32, ssse3);
+decl_itx2_fns (32,  8, ssse3);
+decl_itx2_fns (16, 32, ssse3);
+decl_itx2_fns (32, 16, ssse3);
+decl_itx2_fns (32, 32, ssse3);
+
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
+
+COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
+    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
+    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
+    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
+    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
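+
+/* For example, assign_itx1_fn(R, 16, 64, avx2) expands to
+ *     c->itxfm_add[RTX_16X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x64_avx2;
+ */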
+
+
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+    assign_itx17_fn(,   4,  4, ssse3);
+    assign_itx16_fn(R,  4,  8, ssse3);
+    assign_itx16_fn(R,  8,  4, ssse3);
+    assign_itx16_fn(,   8,  8, ssse3);
+    assign_itx16_fn(R,  4, 16, ssse3);
+    assign_itx16_fn(R, 16,  4, ssse3);
+    assign_itx16_fn(R,  8, 16, ssse3);
+    assign_itx16_fn(R, 16,  8, ssse3);
+    assign_itx12_fn(,  16, 16, ssse3);
+    assign_itx2_fn (R,  8, 32, ssse3);
+    assign_itx2_fn (R, 32,  8, ssse3);
+    assign_itx2_fn (R, 16, 32, ssse3);
+    assign_itx2_fn (R, 32, 16, ssse3);
+    assign_itx2_fn (,  32, 32, ssse3);
+    assign_itx1_fn (R, 16, 64, ssse3);
+    assign_itx1_fn (R, 32, 64, ssse3);
+    assign_itx1_fn (R, 64, 16, ssse3);
+    assign_itx1_fn (R, 64, 32, ssse3);
+    assign_itx1_fn ( , 64, 64, ssse3);
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    assign_itx17_fn( ,  4,  4, avx2);
+    assign_itx16_fn(R,  4,  8, avx2);
+    assign_itx16_fn(R,  4, 16, avx2);
+    assign_itx16_fn(R,  8,  4, avx2);
+    assign_itx16_fn( ,  8,  8, avx2);
+    assign_itx16_fn(R,  8, 16, avx2);
+    assign_itx2_fn (R,  8, 32, avx2);
+    assign_itx16_fn(R, 16,  4, avx2);
+    assign_itx16_fn(R, 16,  8, avx2);
+    assign_itx12_fn( , 16, 16, avx2);
+    assign_itx2_fn (R, 16, 32, avx2);
+    assign_itx1_fn (R, 16, 64, avx2);
+    assign_itx2_fn (R, 32,  8, avx2);
+    assign_itx2_fn (R, 32, 16, avx2);
+    assign_itx2_fn ( , 32, 32, avx2);
+    assign_itx1_fn (R, 32, 64, avx2);
+    assign_itx1_fn (R, 64, 16, avx2);
+    assign_itx1_fn (R, 64, 32, avx2);
+    assign_itx1_fn ( , 64, 64, avx2);
+#endif
+}
diff --git a/src/x86/itx_ssse3.asm b/src/x86/itx_ssse3.asm
new file mode 100644
index 0000000..91cf666
--- /dev/null
@@ -0,0 +1,6558 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+
+deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
+
+%macro COEF_PAIR 2-3 0 ; %3: nonzero = also define pw_m%1_m%2, 2 = skip pw_%2_%1
+pw_%1_m%2:  times 4 dw  %1, -%2
+%if %3 != 2
+pw_%2_%1:   times 4 dw  %2,  %1
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2
+%endif
+%endmacro
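+; e.g. COEF_PAIR 1567, 3784 defines pw_1567_m3784 (1567, -3784 repeated)
+; and pw_3784_1567 (3784, 1567 repeated)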
+
+;adst4
+pw_1321_3803:   times 4 dw  1321,  3803
+pw_2482_m1321:  times 4 dw  2482, -1321
+pw_3344_2482:   times 4 dw  3344,  2482
+pw_3344_m3803:  times 4 dw  3344, -3803
+pw_3344_m3344:  times 4 dw  3344, -3344
+pw_0_3344:      times 4 dw     0,  3344
+pw_m6688_m3803: times 4 dw -6688, -3803
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR  799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR  401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567, 1
+COEF_PAIR  995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4017,  799, 1
+COEF_PAIR  201, 4091
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 4052,  601
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076,  401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
+
+pd_2048:        times 4 dd  2048
+pw_2048:        times 8 dw  2048
+pw_m2048:       times 8 dw -2048
+pw_4096:        times 8 dw  4096
+pw_16384:       times 8 dw  16384
+pw_m16384:      times 8 dw  -16384
+pw_1697x16:     times 8 dw  1697*16
+pw_1697x8:      times 8 dw  1697*8
+pw_2896x8:      times 8 dw  2896*8
+pw_3344x8:      times 8 dw  3344*8
+pw_8192:        times 8 dw  8192
+pw_m8192:       times 8 dw -8192
+pw_5:           times 8 dw  5
+pw_201x8:       times 8 dw   201*8
+pw_4091x8:      times 8 dw  4091*8
+pw_m2751x8:     times 8 dw -2751*8
+pw_3035x8:      times 8 dw  3035*8
+pw_1751x8:      times 8 dw  1751*8
+pw_3703x8:      times 8 dw  3703*8
+pw_m1380x8:     times 8 dw -1380*8
+pw_3857x8:      times 8 dw  3857*8
+pw_995x8:       times 8 dw   995*8
+pw_3973x8:      times 8 dw  3973*8
+pw_m2106x8:     times 8 dw -2106*8
+pw_3513x8:      times 8 dw  3513*8
+pw_2440x8:      times 8 dw  2440*8
+pw_3290x8:      times 8 dw  3290*8
+pw_m601x8:      times 8 dw  -601*8
+pw_4052x8:      times 8 dw  4052*8
+
+pw_4095x8:      times 8 dw  4095*8
+pw_101x8:       times 8 dw   101*8
+pw_2967x8:      times 8 dw  2967*8
+pw_m2824x8:     times 8 dw -2824*8
+pw_3745x8:      times 8 dw  3745*8
+pw_1660x8:      times 8 dw  1660*8
+pw_3822x8:      times 8 dw  3822*8
+pw_m1474x8:     times 8 dw -1474*8
+pw_3996x8:      times 8 dw  3996*8
+pw_897x8:       times 8 dw   897*8
+pw_3461x8:      times 8 dw  3461*8
+pw_m2191x8:     times 8 dw -2191*8
+pw_3349x8:      times 8 dw  3349*8
+pw_2359x8:      times 8 dw  2359*8
+pw_4036x8:      times 8 dw  4036*8
+pw_m700x8:      times 8 dw  -700*8
+pw_4065x8:      times 8 dw  4065*8
+pw_501x8:       times 8 dw   501*8
+pw_3229x8:      times 8 dw  3229*8
+pw_m2520x8:     times 8 dw -2520*8
+pw_3564x8:      times 8 dw  3564*8
+pw_2019x8:      times 8 dw  2019*8
+pw_3948x8:      times 8 dw  3948*8
+pw_m1092x8:     times 8 dw -1092*8
+pw_3889x8:      times 8 dw  3889*8
+pw_1285x8:      times 8 dw  1285*8
+pw_3659x8:      times 8 dw  3659*8
+pw_m1842x8:     times 8 dw -1842*8
+pw_3102x8:      times 8 dw  3102*8
+pw_2675x8:      times 8 dw  2675*8
+pw_4085x8:      times 8 dw  4085*8
+pw_m301x8:      times 8 dw  -301*8
+
+SECTION .text
+
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
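+; e.g. REPX {mova x, m8}, m9, m10 expands to "mova m9, m8" and "mova m10, m8"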
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
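+; (on x86-32, r5 is loaded with the address of $$ via LEA in INV_TXFM_FN,
+; keeping the data references position-independent)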
+
+%macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
+    lea                  r2, [dstq+strideq*2]
+%assign %%i 1
+%rotate 5
+%rep 4
+    %if %1 & 2
+        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
+    %else
+        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+    %endif
+    %assign %%i %%i + 1
+    %rotate 1
+%endrep
+
+    movd                 m%3, [%%row_adr1]        ;dst0
+    movd                 m%5, [%%row_adr2]        ;dst1
+    punpckldq            m%3, m%5                 ;high: dst1 ;low: dst0
+    movd                 m%4, [%%row_adr3]        ;dst2
+    movd                 m%5, [%%row_adr4]        ;dst3
+    punpckldq            m%4, m%5                 ;high: dst3 ;low: dst2
+
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
+
+    paddw                m%3, m%1                 ;high: dst1 + out1 ;low: dst0 + out0
+    paddw                m%4, m%2                 ;high: dst3 + out3 ;low: dst2 + out2
+
+    packuswb             m%3, m%4                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+    movd        [%%row_adr1], m%3                  ;store dst0 + out0
+    pshuflw              m%4, m%3, q1032
+    movd        [%%row_adr2], m%4                  ;store dst1 + out1
+    punpckhqdq           m%3, m%3
+    movd        [%%row_adr3], m%3                  ;store dst2 + out2
+    psrlq                m%3, 32
+    movd        [%%row_adr4], m%3                  ;store dst3 + out3
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+    mova                 m2, [o(pw_%5)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+
+    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4
+    ret
+%endmacro
+
+; flags: 1 = swapped coef order, 2 = coefs held in registers, 4 = skip the final pack
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2
+    pmaddwd              m%2, m%4, m%1
+    pmaddwd              m%1, m%5
+%elif %6 & 1
+    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
+    pmaddwd              m%1, [o(pw_%4_m%5)]
+%else
+    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
+    pmaddwd              m%1, [o(pw_%5_%4)]
+%endif
+    paddd                m%2, m%3
+    paddd                m%1, m%3
+    psrad                m%2, 12
+    psrad                m%1, 12
+%if %6 & 4 == 0
+    packssdw             m%1, m%2
+%endif
+%endmacro
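+; with flags=0, for each packed word pair (a, b) this produces
+;   low half:  (a*coef2 + b*coef1 + 2048) >> 12
+;   high half: (a*coef1 - b*coef2 + 2048) >> 12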
+
+%macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
+    mova                 m3, [o(pd_2048)]
+    punpckhwd            m2, m0, m1            ;unpacked in1 in3
+    punpcklwd            m0, m1                ;unpacked in0 in2
+    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
+    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896
+    psubsw               m1, m0, m2            ;high: out2 ;low: out3
+    paddsw               m0, m2                ;high: out1 ;low: out0
+%endmacro
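+; (2896 ~= 4096/sqrt(2) scales the even pair in0/in2, while 1567/3784 ~=
+; 4096*(sin, cos)(pi/8) rotate the odd pair in1/in3)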
+
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
+%if ARCH_X86_32
+    LEA                    r5, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+    test                 eobd, eobd
+    jz %%end
+%endif
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
+    call %%p1
+    RET
+%%end:
+%else
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
+%ifidn %1_%2, dct_dct
+    test                 eobd, eobd
+    jnz %%p1
+%else
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mov            [coeffq], eobd                ;0
+    pmulhrsw             m0, m1
+    mova                 m1, m0
+    TAIL_CALL m(iadst_4x4_internal).end2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
+    mova                 m1, [coeffq+16*1]      ;high: in3 ;low: in2
+
+    IDCT4_1D_PACKED
+
+    mova                 m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m1, q0220
+    pshufb               m0, m2                 ;high: in1 ;low: in0
+    pshufb               m1, m3, m2             ;high: in3 ;low: in2
+    jmp                tx2q
+
+.pass2:
+    IDCT4_1D_PACKED
+
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+    ITX4_END     0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    call .main
+    punpckhwd            m2, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m2       ;high: in3 ;low: in2
+    punpcklwd            m0, m2           ;high: in1 ;low: in0
+    jmp                tx2q
+
+.pass2:
+    call .main
+
+.end:
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2
+
+.end2:
+    ITX4_END              0, 1, 2, 3
+
+ALIGN function_align
+.main:
+    punpcklwd            m2, m0, m1                ;unpacked in0 in2
+    punpckhwd            m0, m1                    ;unpacked in1 in3
+    mova                 m3, m0
+    pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+    pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3
+    paddd                m1, m0                    ;t2
+    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
+    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+    paddd                m4, m0                    ;t0 + t3
+    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
+    mova                 m0, [o(pd_2048)]
+    paddd                m1, m0                    ;t2 + 2048
+    paddd                m2, m0
+    paddd                m0, m4                    ;t0 + t3 + 2048
+    paddd                m5, m2                    ;t1 + t3 + 2048
+    paddd                m2, m4
+    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
+    REPX      {psrad x, 12}, m1, m0, m5, m2
+    packssdw             m0, m5                    ;high: out1 ;low: out0
+    packssdw             m1, m2                    ;high: out3 ;low: out2
+    ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    call m(iadst_4x4_internal).main
+    punpcklwd            m2, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m1, m2            ;high: in3 ;low: in2
+    punpckhwd            m1, m2                ;high: in1 ;low: in0
+    jmp                tx2q
+
+.pass2:
+    call m(iadst_4x4_internal).main
+
+.end:
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2
+
+.end2:
+    ITX4_END              3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m3, [o(pw_1697x8)]
+    pmulhrsw             m2, m0, m3
+    pmulhrsw             m3, m1
+    paddsw               m0, m2
+    paddsw               m1, m3
+    punpckhwd            m2, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m0, m2            ;high: in3 ;low: in2
+    punpcklwd            m0, m2                ;high: in1 ;low: in0
+    jmp                tx2q
+
+.pass2:
+    mova                 m3, [o(pw_1697x8)]
+    pmulhrsw             m2, m3, m0
+    pmulhrsw             m3, m1
+    paddsw               m0, m2
+    paddsw               m1, m3
+    jmp m(iadst_4x4_internal).end
+
+%macro IWHT4_1D_PACKED 0
+    punpckhqdq           m3, m0, m1            ;low: in1 high: in3
+    punpcklqdq           m0, m1                ;low: in0 high: in2
+    psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
+    paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
+    punpckhqdq           m2, m2                ;t2 t2
+    punpcklqdq           m0, m0                ;t0 t0
+    psubw                m1, m0, m2
+    psraw                m1, 1                 ;t4 t4
+    psubw                m1, m3                ;low: t1/out2 high: t3/out1
+    psubw                m0, m1                ;high: out0
+    paddw                m2, m1                ;low: out3
+%endmacro
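+
+;scalar sketch of the packed macro above (inverse WHT; the caller has
+;already shifted the inputs right by 2):
+;  t0 = in0 + in1          t2 = in2 - in3
+;  t4 = (t0 - t2) >> 1
+;  out1 = t4 - in3         out2 = t4 - in1
+;  out0 = t0 - out1        out3 = t2 + out2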
+
+cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2
+    psraw                m0, 2
+    psraw                m1, 2
+
+    IWHT4_1D_PACKED
+
+    punpckhwd            m0, m1
+    punpcklwd            m3, m1, m2
+    punpckhdq            m1, m0, m3
+    punpckldq            m0, m3
+
+    IWHT4_1D_PACKED
+
+    shufpd               m0, m2, 0x01
+    ITX4_END              0, 3, 2, 1, 0
+
+
+%macro IDCT8_1D_PACKED 0
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
+    punpcklwd            m0, m2                     ;unpacked in0 in4
+    punpckhwd            m2, m1                     ;unpacked in5 in3
+    punpcklwd            m1, m3                     ;unpacked in2 in6
+    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
+    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
+    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
+    psubsw               m3, m4, m2                 ;low: t6a high: t5a
+    paddsw               m4, m2                     ;low: t7  high: t4
+    pshufb               m3, [o(deint_shuf1)]
+    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
+    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
+    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
+    paddsw               m0, m1                     ;low: tmp0 high: tmp1
+    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
+    punpckhqdq           m4, m3                     ;low: t4   high: t5
+    psubsw               m3, m0, m1                 ;low: out7 high: out6
+    paddsw               m0, m1                     ;low: out0 high: out1
+    paddsw               m1, m2, m4                 ;low: out3 high: out2
+    psubsw               m2, m4                     ;low: out4 high: out5
+%endmacro
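+
+;scalar sketch of the packed 8-point IDCT above; every ITX_MUL2X_PACK is
+;one Q12 rotation applied to a word-interleaved pair, so each register
+;carries two of the values below (lanes as commented per instruction):
+;  t4a = (in1* 799 - in7*4017 + 2048) >> 12  t7a = (in1*4017 + in7* 799 + 2048) >> 12
+;  t5a = (in5*3406 - in3*2276 + 2048) >> 12  t6a = (in5*2276 + in3*3406 + 2048) >> 12
+;  t2  = (in2*1567 - in6*3784 + 2048) >> 12  t3  = (in2*3784 + in6*1567 + 2048) >> 12
+;  t0  = ((in0+in4)*2896     + 2048) >> 12  t1  = ((in0-in4)*2896     + 2048) >> 12
+;followed by the butterflies and the (t6a +/- t5a)*2896 pass commented above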
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
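+;i.e. a Q12 fixed-point rotation of the (src1, src2) pair:
+;  [dst1]   [coef1 -coef2] [src1]
+;  [dst2] = [coef2  coef1] [src2]   (round, then >> 12)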
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
+    punpckhwd           m%4, m%1, m%2
+    punpcklwd           m%1, m%2
+%if %7 < 8
+    pmaddwd             m%2, m%7, m%1
+    pmaddwd             m%3, m%7, m%4
+%else
+    mova                m%2, [o(pw_%7_%6)]
+%if %8
+    pmaddwd             m%3, m%1, m%2
+    pmaddwd             m%2, m%4
+%else
+    pmaddwd             m%3, m%4, m%2
+    pmaddwd             m%2, m%1
+%endif
+%endif
+    paddd               m%3, m%5
+    paddd               m%2, m%5
+    psrad               m%3, 12
+    psrad               m%2, 12
+%if %8
+    packssdw            m%3, m%2
+%else
+    packssdw            m%2, m%3                 ;dst2
+%endif
+%if %7 < 8
+    pmaddwd             m%4, m%6
+    pmaddwd             m%1, m%6
+%elif %8
+    mova                m%2, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%2
+    pmaddwd             m%1, m%2
+%else
+    mova                m%3, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%3
+    pmaddwd             m%1, m%3
+%endif
+    paddd               m%4, m%5
+    paddd               m%1, m%5
+    psrad               m%4, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%4                 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+    psubsw              m%3, m%1, m%2                      ;out2
+    paddsw              m%2, m%1                           ;out1
+    paddsw              m%1, m%5, m%4                      ;out0
+    psubsw              m%4, m%5                           ;out3
+%endmacro
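+
+;scalar sketch of IDCT4_1D (src1..4 = in0..in3, per the comments above):
+;  t2 = (in1*1567 - in3*3784 + 2048) >> 12  t3 = (in1*3784 + in3*1567 + 2048) >> 12
+;  t0 = ((in0+in2)*2896     + 2048) >> 12  t1 = ((in0-in2)*2896     + 2048) >> 12
+;  out0 = t0 + t3  out1 = t1 + t2  out2 = t1 - t2  out3 = t0 - t3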
+
+%macro WRITE_4X8 4 ;row[1-4]
+    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
+    lea                dstq, [dstq+strideq*4]
+    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
+    punpckldq            m0, m2                      ;low: in0 high: in1
+    punpckldq            m2, m3, m4                  ;low: in4 high: in5
+    punpckhdq            m3, m4                      ;low: in6 high: in7
+%endmacro
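+
+;INV_4X8 is the pass1-to-pass2 repack for 4x8: it interleaves the four
+;registers of 8x4-layout output back into in0..in7 pairs, two rows per
+;register, with the lanes as commented above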
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mov           [coeffq], eobd
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    TAIL_CALL m(iadst_4x8_internal).end3
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+.pass1:
+    call m(idct_8x4_internal).main
+    jmp m(iadst_4x8_internal).pass1_end
+
+.pass2:
+    call .main
+    shufps               m1, m1, q1032
+    shufps               m3, m3, q1032
+    mova                 m4, [o(pw_2048)]
+    jmp m(iadst_4x8_internal).end2
+
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+.pass1:
+    call m(iadst_8x4_internal).main
+
+.pass1_end:
+    INV_4X8
+    jmp                tx2q
+
+.pass2:
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call .main
+    mova                 m4, [o(pw_2048)]
+    pxor                 m5, m5
+    psubw                m5, m4
+
+.end:
+    punpcklqdq           m4, m5
+
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+    pxor                 m5, m5
+    mova      [coeffq+16*0], m5
+    mova      [coeffq+16*1], m5
+    mova      [coeffq+16*2], m5
+    mova      [coeffq+16*3], m5
+
+.end3:
+    WRITE_4X8             0, 1, 2, 3
+    RET
+
+ALIGN function_align
+.main:
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m4, m3, m0                ;unpacked in7 in0
+    punpckhwd            m5, m2, m1                ;unpacked in5 in2
+    punpcklwd            m1, m2                    ;unpacked in3 in4
+    punpcklwd            m0, m3                    ;unpacked in1 in6
+    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
+    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
+    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
+    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
+
+    psubsw               m3, m4, m1                ;low:  t4    high:  t5
+    paddsw               m4, m1                    ;low:  t0    high:  t1
+    psubsw               m2, m5, m0                ;low:  t6    high:  t7
+    paddsw               m5, m0                    ;low:  t2    high:  t3
+
+    shufps               m1, m3, m2, q1032
+    punpckhwd            m2, m1
+    punpcklwd            m3, m1
+    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
+    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
+
+    psubsw               m1, m4, m5                ;low:  t2    high:  t3
+    paddsw               m4, m5                    ;low:  out0  high: -out7
+    psubsw               m5, m3, m2                ;low:  t7    high:  t6
+    paddsw               m3, m2                    ;low:  out6  high: -out1
+    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
+    shufps               m3, m4, q3210             ;low:  out6  high: -out7
+
+    mova                 m2, [o(pw_2896_m2896)]
+    mova                 m7, [o(pw_2896_2896)]
+    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
+    shufps               m1, m5, q3210             ;low:  t2    high:  t6
+    punpcklwd            m5, m1, m4
+    punpckhwd            m1, m4
+    pmaddwd              m4, m2, m1                ;-out5
+    pmaddwd              m2, m5                    ; out4
+    pmaddwd              m1, m7                    ; out2
+    pmaddwd              m5, m7                    ;-out3
+    REPX      {paddd x, m6}, m4, m2, m1, m5
+    REPX      {psrad x, 12}, m4, m2, m1, m5
+    packssdw             m1, m5                    ;low:  out2  high: -out3
+    packssdw             m2, m4                    ;low:  out4  high: -out5
+    ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+.pass1:
+    call m(iadst_8x4_internal).main
+
+    punpcklwd            m4, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m5, m1, m0
+    punpckhwd            m1, m0
+    punpckldq            m2, m3, m1                  ;low: in4 high: in5
+    punpckhdq            m3, m1                      ;low: in6 high: in7
+    punpckldq            m0, m4, m5                  ;low: in0 high: in1
+    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
+    jmp                tx2q
+
+.pass2:
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    mova                 m4, m0
+    mova                 m5, m1
+    pshufd               m0, m3, q1032
+    pshufd               m1, m2, q1032
+    pshufd               m2, m5, q1032
+    pshufd               m3, m4, q1032
+    mova                 m5, [o(pw_2048)]
+    pxor                 m4, m4
+    psubw                m4, m5
+    jmp m(iadst_4x8_internal).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+.pass1:
+    mova                 m7, [o(pw_1697x8)]
+    pmulhrsw             m4, m7, m0
+    pmulhrsw             m5, m7, m1
+    pmulhrsw             m6, m7, m2
+    pmulhrsw             m7, m3
+    paddsw               m0, m4
+    paddsw               m1, m5
+    paddsw               m2, m6
+    paddsw               m3, m7
+    jmp m(iadst_4x8_internal).pass1_end
+
+.pass2:
+    mova                 m4, [o(pw_4096)]
+    jmp m(iadst_4x8_internal).end2
+
+
+%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
+    movq                 m%3, [dstq        ]
+    movq                 m%4, [dstq+strideq]
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
+%ifnum %1
+    paddw                m%3, m%1
+%else
+    paddw                m%3, %1
+%endif
+%ifnum %2
+    paddw                m%4, m%2
+%else
+    paddw                m%4, %2
+%endif
+    packuswb             m%3, m%4
+    movq      [dstq        ], m%3
+    punpckhqdq           m%3, m%3
+    movq      [dstq+strideq], m%3
+%endmacro
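+
+;WRITE_8X2 adds two rows of 16-bit residue to two 8-pixel destination
+;rows and repacks with unsigned saturation (packuswb); the %ifnum
+;branches let %1/%2 be either register numbers or memory operands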
+
+%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
+    WRITE_8X2             %1, %2, %5, %6, %7
+    lea                dstq, [dstq+strideq*2]
+    WRITE_8X2             %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+    mova                 m2, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    TAIL_CALL m(iadst_8x4_internal).end2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(idct_4x8_internal).main
+
+    mova                 m4, [o(deint_shuf1)]
+    mova                 m5, [o(deint_shuf2)]
+    pshufb               m0, m4
+    pshufb               m1, m5
+    pshufb               m2, m4
+    pshufb               m3, m5
+    punpckhdq            m4, m0, m1
+    punpckldq            m0, m1
+    punpckhdq            m5, m2, m3
+    punpckldq            m2, m3
+    punpckhqdq           m1, m0, m2                      ;in1
+    punpcklqdq           m0, m2                          ;in0
+    punpckhqdq           m3, m4, m5                      ;in3
+    punpcklqdq           m2, m4, m5                      ;in2
+    jmp                tx2q
+
+.pass2:
+    call .main
+    jmp m(iadst_8x4_internal).end
+
+ALIGN function_align
+.main:
+    mova                 m6, [o(pd_2048)]
+    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    pxor                 m5, m5
+    psubsw               m3, m5, m1
+    psubsw               m5, m4
+    punpckhdq            m4, m5, m3
+    punpckldq            m5, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m5      ;in1
+    punpcklwd            m0, m5          ;in0
+    punpcklwd            m2, m3, m4      ;in2
+    punpckhwd            m3, m4          ;in3
+    jmp              tx2q
+
+.pass2:
+    call .main
+
+.end:
+    mova                 m4, [o(pw_2048)]
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+
+.end2:
+    pxor                 m6, m6
+    mova      [coeffq+16*0], m6
+    mova      [coeffq+16*1], m6
+    mova      [coeffq+16*2], m6
+    mova      [coeffq+16*3], m6
+.end3:
+    WRITE_8X4             0, 1, 2, 3, 4, 5, 6
+    RET
+
+ALIGN function_align
+.main:
+    punpckhwd            m6, m0, m2                    ;unpacked in0 in2
+    punpcklwd            m0, m2                        ;unpacked in0 in2
+    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
+    punpcklwd            m1, m3                        ;unpacked in1 in3
+
+    mova                 m2, [o(pw_3344_m3344)]
+    mova                 m4, [o(pw_0_3344)]
+    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
+    pmaddwd              m5, m4, m7                    ;3344 * in3
+    pmaddwd              m2, m0
+    pmaddwd              m4, m1
+    paddd                m3, m5
+    paddd                m2, m4
+    mova                 m4, [o(pd_2048)]
+    paddd                m3, m4                        ;t2 + 2048
+    paddd                m2, m4
+    psrad                m3, 12
+    psrad                m2, 12
+    packssdw             m2, m3                        ;out2
+
+    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m3, m4                        ;t0 + t3
+
+    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+    mova                 m4, [o(pd_2048)]
+    paddd                m0, m4
+    paddd                m4, m3                        ;t0 + t3 + 2048
+    paddd                m5, m0                        ;t1 + t3 + 2048
+    paddd                m3, m0
+    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m3, 12                        ;out3
+    packssdw             m0, m4, m5                    ;low: out0  high: out1
+
+    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m1, m4                        ;t0 + t3
+    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+
+    mova                 m4, [o(pd_2048)]
+    paddd                m6, m4
+    paddd                m4, m1                        ;t0 + t3 + 2048
+    paddd                m5, m6                        ;t1 + t3 + 2048
+    paddd                m1, m6
+    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m1, 12                        ;out3
+    packssdw             m3, m1                        ;out3
+    packssdw             m4, m5                        ;low: out0  high: out1
+
+    punpckhqdq           m1, m0, m4                    ;out1
+    punpcklqdq           m0, m4                        ;out0
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    punpckhwd            m5, m3, m2
+    punpcklwd            m3, m2
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+
+    pxor                 m0, m0
+    psubsw               m4, m0, m2
+    psubsw               m0, m5
+    punpckhdq            m2, m0, m4
+    punpckldq            m0, m4
+    punpckhdq            m4, m3, m1
+    punpckldq            m3, m1
+    punpckhwd            m1, m0, m3      ;in1
+    punpcklwd            m0, m3          ;in0
+    punpckhwd            m3, m2, m4      ;in3
+    punpcklwd            m2, m4          ;in2
+    jmp                  tx2q
+
+.pass2:
+    call m(iadst_8x4_internal).main
+    mova                 m4, m0
+    mova                 m5, m1
+    mova                 m0, m3
+    mova                 m1, m2
+    mova                 m2, m5
+    mova                 m3, m4
+    jmp m(iadst_8x4_internal).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+    paddsw               m0, m0
+    paddsw               m1, m1
+    paddsw               m2, m2
+    paddsw               m3, m3
+
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m5, m4, m1
+    punpckldq            m4, m1
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m4      ;in1
+    punpcklwd            m0, m4          ;in0
+    punpcklwd            m2, m3, m5      ;in2
+    punpckhwd            m3, m5          ;in3
+    jmp                tx2q
+
+.pass2:
+    mova                 m7, [o(pw_1697x8)]
+    pmulhrsw             m4, m7, m0
+    pmulhrsw             m5, m7, m1
+    pmulhrsw             m6, m7, m2
+    pmulhrsw             m7, m3
+    paddsw               m0, m4
+    paddsw               m1, m5
+    paddsw               m2, m6
+    paddsw               m3, m7
+    jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklwd            m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mova                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m2
+    psrlw                m2, 3
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+.end:
+    mov                 r3d, 2
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)]
+.loop:
+    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
+    lea                dstq, [dstq+strideq*2]
+    dec                 r3d
+    jg .loop
+    jmp                tx2q
+.end3:
+    RET
+%endif
+%endmacro
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+    mova                 m7, [o(pw_2896x8)]
+    pmulhrsw             m0, m7, [%1+%2*0]
+    pmulhrsw             m1, m7, [%1+%2*1]
+    pmulhrsw             m2, m7, [%1+%2*2]
+    pmulhrsw             m3, m7, [%1+%2*3]
+    pmulhrsw             m4, m7, [%1+%2*4]
+    pmulhrsw             m5, m7, [%1+%2*5]
+    pmulhrsw             m6, m7, [%1+%2*6]
+    pmulhrsw             m7, [%1+%2*7]
+%else
+    mova                 m0, [%1+%2*0]
+    mova                 m1, [%1+%2*1]
+    mova                 m2, [%1+%2*2]
+    mova                 m3, [%1+%2*3]
+    mova                 m4, [%1+%2*4]
+    mova                 m5, [%1+%2*5]
+    mova                 m6, [%1+%2*6]
+    mova                 m7, [%1+%2*7]
+%endif
+%endmacro
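+
+;with is_rect2 set, the rectangular-transform input scale is folded into
+;the load: each row is multiplied by 2896/4096 (~1/sqrt(2)) via pmulhrsw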
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
+    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+    psubsw               m%2, m%4, m%5                      ;t6a
+    paddsw               m%4, m%5                           ;t7
+    psubsw               m%5, m%1, m%3                      ;t5a
+    paddsw               m%1, m%3                           ;t4
+    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
+%endmacro
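+
+;scalar sketch (src1..4 = in1, in3, in5, in7 of the 8-point IDCT):
+;  t4a = (in1* 799 - in7*4017 + 2048) >> 12  t7a = (in1*4017 + in7* 799 + 2048) >> 12
+;  t5a = (in5*3406 - in3*2276 + 2048) >> 12  t6a = (in5*2276 + in3*3406 + 2048) >> 12
+;  t4 = t4a + t5a   t7 = t7a + t6a
+;  t5 = ((t7a-t6a - (t4a-t5a))*2896 + 2048) >> 12
+;  t6 = ((t7a-t6a + (t4a-t5a))*2896 + 2048) >> 12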
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS          coeffq, 16
+
+.pass1:
+    call .main
+
+.pass1_end:
+    mova                    m7, [o(pw_16384)]
+
+.pass1_end1:
+    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova    [rsp+gprsize+16*1], m6
+
+.pass1_end2:
+    REPX      {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw                m7, [rsp+gprsize+16*0]
+
+.pass1_end3:
+    punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53
+    punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57
+    punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47
+    punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43
+    punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77
+    punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73
+    punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77
+    punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75
+    punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73
+    punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71
+    mova    [rsp+gprsize+16*2], m6
+    mova                    m6, [rsp+gprsize+16*1]
+    punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67
+    punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63
+    punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67
+    punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65
+    punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63
+    punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61
+
+    punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77
+    punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76
+    mova    [rsp+gprsize+16*0], m2
+    punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72
+    punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73
+    punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74
+    punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75
+    mova                    m7, [rsp+gprsize+16*2]
+    punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71
+    punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70
+    mova                    m7, [rsp+gprsize+16*0]
+    jmp                   tx2q
+
+.pass2:
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
+    call .main
+
+.end:
+    mova                    m7, [o(pw_2048)]
+    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova    [rsp+gprsize+16*1], m6
+
+.end2:
+    REPX      {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw                m7, [rsp+gprsize+16*0]
+    mova    [rsp+gprsize+16*2], m5
+    mova    [rsp+gprsize+16*0], m7
+
+.end3:
+    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
+    lea                   dstq, [dstq+strideq*2]
+    WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+    jmp                   tx2q
+
+.end4:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    ret
+
+ALIGN function_align
+.main:
+    mova  [rsp+gprsize*2+16*0], m7
+    mova  [rsp+gprsize*2+16*1], m3
+    mova  [rsp+gprsize*2+16*2], m1
+    mova                    m7, [o(pd_2048)]
+    IDCT4_1D                 0, 2, 4, 6, 1, 3, 7
+    mova                    m3, [rsp+gprsize*2+16*2]
+    mova  [rsp+gprsize*2+16*2], m2
+    mova                    m2, [rsp+gprsize*2+16*1]
+    mova  [rsp+gprsize*2+16*1], m4
+    mova                    m4, [rsp+gprsize*2+16*0]
+    mova  [rsp+gprsize*2+16*0], m6
+    IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7
+    mova                    m6, [rsp+gprsize*2+16*0]
+    psubsw                  m7, m0, m4                    ;out7
+    paddsw                  m0, m4                        ;out0
+    mova  [rsp+gprsize*2+16*0], m7
+    mova                    m1, [rsp+gprsize*2+16*2]
+    psubsw                  m4, m6, m3                    ;out4
+    paddsw                  m3, m6                        ;out3
+    mova                    m7, [rsp+gprsize*2+16*1]
+    psubsw                  m6, m1, m5                    ;out6
+    paddsw                  m1, m5                        ;out1
+    psubsw                  m5, m7, m2                    ;out5
+    paddsw                  m2, m7                        ;out2
+    ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS          coeffq, 16
+
+.pass1:
+    call .main
+    call .main_pass1_end
+
+.pass1_end:
+    mova                    m7, [o(pw_16384)]
+
+.pass1_end1:
+    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova    [rsp+gprsize+16*1], m6
+    pxor                    m6, m6
+    psubw                   m6, m7
+    mova                    m7, m6
+    jmp m(idct_8x8_internal).pass1_end2
+
+ALIGN function_align
+.pass2:
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
+    call .main
+    call .main_pass2_end
+
+.end:
+    mova                    m7, [o(pw_2048)]
+    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova    [rsp+gprsize+16*1], m6
+    pxor                    m6, m6
+    psubw                   m6, m7
+    mova                    m7, m6
+    jmp m(idct_8x8_internal).end2
+
+ALIGN function_align
+.main:
+    mova  [rsp+gprsize*2+16*0], m7
+    mova  [rsp+gprsize*2+16*1], m3
+    mova  [rsp+gprsize*2+16*2], m4
+    mova                    m7, [o(pd_2048)]
+    ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
+    ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
+    paddsw                  m3, m2, m6                    ;t2
+    psubsw                  m2, m6                        ;t6
+    paddsw                  m4, m5, m1                    ;t3
+    psubsw                  m5, m1                        ;t7
+    ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
+
+    mova                    m6, [rsp+gprsize*2+16*2]
+    mova  [rsp+gprsize*2+16*2], m5
+    mova                    m1, [rsp+gprsize*2+16*1]
+    mova  [rsp+gprsize*2+16*1], m2
+    mova                    m5, [rsp+gprsize*2+16*0]
+    mova  [rsp+gprsize*2+16*0], m3
+    ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
+    ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
+    psubsw                  m2, m0, m6                    ;t4
+    paddsw                  m0, m6                        ;t0
+    paddsw                  m3, m5, m1                    ;t1
+    psubsw                  m5, m1                        ;t5
+    ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
+
+    mova                    m7, [rsp+gprsize*2+16*0]
+    paddsw                  m1, m3, m4                    ;-out7
+    psubsw                  m3, m4                        ;t3
+    mova  [rsp+gprsize*2+16*0], m1
+    psubsw                  m4, m0, m7                    ;t2
+    paddsw                  m0, m7                        ;out0
+    mova                    m6, [rsp+gprsize*2+16*2]
+    mova                    m7, [rsp+gprsize*2+16*1]
+    paddsw                  m1, m5, m6                    ;-out1
+    psubsw                  m5, m6                        ;t6
+    paddsw                  m6, m2, m7                    ;out6
+    psubsw                  m2, m7                        ;t7
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova  [rsp+gprsize*2+16*1], m1
+    mova  [rsp+gprsize*2+16*2], m6
+    punpckhwd               m1, m4, m3
+    punpcklwd               m4, m3
+    punpckhwd               m7, m5, m2
+    punpcklwd               m5, m2
+    mova                    m2, [o(pw_2896_2896)]
+    mova                    m6, [o(pd_2048)]
+    pmaddwd                 m3, m2, m7
+    pmaddwd                 m2, m5
+    paddd                   m3, m6
+    paddd                   m2, m6
+    psrad                   m3, 12
+    psrad                   m2, 12
+    packssdw                m2, m3                        ;out2
+    mova                    m3, [o(pw_2896_m2896)]
+    pmaddwd                 m7, m3
+    pmaddwd                 m5, m3
+    paddd                   m7, m6
+    paddd                   m5, m6
+    psrad                   m7, 12
+    psrad                   m5, 12
+    packssdw                m5, m7                        ;-out5
+    mova                    m3, [o(pw_2896_2896)]
+    pmaddwd                 m7, m3, m1
+    pmaddwd                 m3, m4
+    paddd                   m7, m6
+    paddd                   m3, m6
+    psrad                   m7, 12
+    psrad                   m3, 12
+    packssdw                m3, m7                        ;-out3
+    mova                    m7, [o(pw_2896_m2896)]
+    pmaddwd                 m1, m7
+    pmaddwd                 m4, m7
+    paddd                   m1, m6
+    paddd                   m4, m6
+    psrad                   m1, 12
+    psrad                   m4, 12
+    packssdw                m4, m1                        ;out4
+    mova                    m1, [rsp+gprsize*2+16*1]
+    mova                    m6, [rsp+gprsize*2+16*2]
+    ret
+ALIGN function_align
+.main_pass2_end:
+    paddsw                  m7, m4, m3                    ;t2 + t3
+    psubsw                  m4, m3                        ;t2 - t3
+    paddsw                  m3, m5, m2                    ;t6 + t7
+    psubsw                  m5, m2                        ;t6 - t7
+    mova                    m2, [o(pw_2896x8)]
+    pmulhrsw                m4, m2                        ;out4
+    pmulhrsw                m5, m2                        ;-out5
+    pmulhrsw                m7, m2                        ;-out3
+    pmulhrsw                m2, m3                        ;out2
+    mova                    m3, m7
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS          coeffq, 16
+
+.pass1:
+    call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass1_end
+
+.pass1_end:
+    mova                    m7, [o(pw_m16384)]
+
+.pass1_end1:
+    pmulhrsw                m1, m7
+    mova    [rsp+gprsize+16*1], m1
+    mova                    m1, m6
+    mova                    m6, m2
+    pmulhrsw                m2, m5, m7
+    mova                    m5, m6
+    mova                    m6, m4
+    pmulhrsw                m4, m3, m7
+    mova                    m3, m6
+    mova                    m6, m0
+    mova                    m0, m7
+    pxor                    m7, m7
+    psubw                   m7, m0
+    pmulhrsw                m0, [rsp+gprsize+16*0]
+    REPX      {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw                m7, m6
+    jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
+    call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass2_end
+
+.end:
+    mova                    m7, [o(pw_2048)]
+    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova    [rsp+gprsize+16*2], m2
+    mova                    m2, m0
+    pxor                    m0, m0
+    psubw                   m0, m7
+    mova                    m7, m2
+    pmulhrsw                m1, m0
+    pmulhrsw                m2, m5, m0
+    mova    [rsp+gprsize+16*1], m1
+    mova                    m5, m4
+    mova                    m1, m6
+    pmulhrsw                m4, m3, m0
+    pmulhrsw                m0, [rsp+gprsize+16*0]
+    mova                    m3, m5
+    mova    [rsp+gprsize+16*0], m7
+    jmp m(idct_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS          coeffq, 16
+    mova    [rsp+gprsize+16*1], m6
+    jmp   m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+
+.end:
+    pmulhrsw                m7, [o(pw_4096)]
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_4096)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova    [rsp+gprsize+16*2], m5
+    mova    [rsp+gprsize+16*1], m6
+    jmp m(idct_8x8_internal).end3
+
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
+    pshuflw               m0, [coeffq], q0000
+    punpcklwd             m0, m0
+    mova                  m1, [o(pw_2896x8)]
+    pmulhrsw              m0, m1
+    mov             [coeffq], eobd
+    pmulhrsw              m0, [o(pw_16384)]
+    pmulhrsw              m0, m1
+    pmulhrsw              m0, [o(pw_2048)]
+.end:
+    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
+    RET
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                  r3, [o(m(idct_4x8_internal).pass1)]
+
+.pass1:
+    mova                 m0, [coeffq+16*1]
+    mova                 m1, [coeffq+16*3]
+    mova                 m2, [coeffq+16*5]
+    mova                 m3, [coeffq+16*7]
+    push               tx2q
+    lea                tx2q, [o(m(idct_4x16_internal).pass1_2)]
+    jmp                  r3
+
+.pass1_2:
+    mova      [coeffq+16*1], m0
+    mova      [coeffq+16*3], m1
+    mova      [coeffq+16*5], m2
+    mova      [coeffq+16*7], m3
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*2]
+    mova                 m2, [coeffq+16*4]
+    mova                 m3, [coeffq+16*6]
+    lea                tx2q, [o(m(idct_4x16_internal).pass1_end)]
+    jmp                  r3
+
+.pass1_end:
+    pop                tx2q
+
+    mova                 m4, [coeffq+16*1]
+    mova                 m5, [coeffq+16*3]
+    mova                 m6, [coeffq+16*5]
+    mova                 m7, [o(pw_16384)]
+    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+    pmulhrsw             m7, [coeffq+16*7]
+    mova       [coeffq+16*7], m7
+    jmp                tx2q
+
+.pass2:
+    call m(idct_16x4_internal).main
+
+.end:
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*4], m4
+
+.end1:
+    mova       [coeffq+16*5], m5
+    mova       [coeffq+16*6], m6
+    mov                   r3, coeffq
+    WRITE_4X8              0, 1, 3, 2
+
+    mova                  m0, [r3+16*4]
+    mova                  m1, [r3+16*5]
+    mova                  m2, [r3+16*6]
+    mova                  m3, m7
+    lea                 dstq, [dstq+strideq*4]
+    WRITE_4X8              0, 1, 3, 2
+
+.end2:
+    pxor                  m7, m7
+    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                   r3, [o(m(iadst_4x8_internal).pass1)]
+    jmp   m(idct_4x16_internal).pass1
+
+.pass2:
+    call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass2_end
+
+    punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
+    punpckhqdq            m4, m5                    ;low:  out8  high:  out10
+    punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6
+    punpckhqdq            m2, m7                    ;low: -out9  high: -out11
+    mova       [coeffq+16*4], m2
+    mova       [coeffq+16*5], m6
+    mova                  m2, [coeffq+16*6]
+    mova                  m6, [coeffq+16*7]
+    punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15
+    punpcklqdq            m0, m6                    ;low:  out0  high:  out2
+    punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14
+    punpcklqdq            m2, m3                    ;low: -out1  high: -out3
+
+    mova                  m7, [o(pw_2048)]
+
+.end1:
+    REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
+    pxor                  m3, m3
+    psubw                 m3, m7
+    mova                  m7, [coeffq+16*4]
+    REPX    {pmulhrsw x, m3}, m2, m7, m1
+    pmulhrsw              m3, [coeffq+16*5]
+    mova       [coeffq+16*7], m5
+
+    punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11
+    punpcklqdq            m4, m7                    ;low:  out8  high:  out9
+    punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15
+    punpcklqdq            m6, m1                    ;low:  out12 high:  out13
+    punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3
+    punpcklqdq            m0, m2                    ;low:  out0  high:  out1
+    mova       [coeffq+16*4], m4
+    mova                  m4, [coeffq+16*7]
+    punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5
+    punpckhqdq            m4, m3                    ;low:  out6  high:  out7
+    mova                  m3, m4
+
+.end2:
+    mova       [coeffq+16*5], m5
+    mova       [coeffq+16*6], m6
+    mov                   r3, coeffq
+    WRITE_4X8              0, 1, 2, 3
+
+    mova                  m0, [r3+16*4]
+    mova                  m1, [r3+16*5]
+    mova                  m2, [r3+16*6]
+    mova                  m3, m7
+    lea                 dstq, [dstq+strideq*4]
+    WRITE_4X8              0, 1, 2, 3
+
+.end3:
+    pxor                  m7, m7
+    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    ret
+
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                   r3, [o(m(iflipadst_4x8_internal).pass1)]
+    jmp   m(idct_4x16_internal).pass1
+
+.pass2:
+    call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass2_end
+
+    punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
+    punpcklqdq            m4, m5                    ;low: -out8  high: -out10
+    punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6
+    punpcklqdq            m2, m7                    ;low:  out9  high:  out11
+    mova       [coeffq+16*4], m2
+    mova       [coeffq+16*5], m6
+    mova                  m2, [coeffq+16*6]
+    mova                  m6, [coeffq+16*7]
+    punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15
+    punpckhqdq            m0, m6                    ;low: -out0  high: -out2
+    punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14
+    punpckhqdq            m2, m3                    ;low:  out1  high:  out3
+
+    mova                  m7, [o(pw_m2048)]
+    jmp   m(iadst_4x16_internal).end1
+
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw            m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+    pmulhrsw            m%2, m%4
+%else
+    paddsw              m%1, m%1
+%endif
+    paddsw              m%1, m%2
+%endmacro
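+
+;16-point identity scale: out ~= 2*sqrt(2)*in, computed as 2*in plus
+;round(in*1697/2048); the optional 4th operand (pw_16384) instead halves
+;the product and adds it to in once, for the ~sqrt(2)*in variant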
+
+cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                  m0, [coeffq+16*1]
+    mova                  m6, [o(pw_1697x8)]
+    mova                  m1, [coeffq+16*3]
+    mova                  m2, [coeffq+16*5]
+    mova                  m3, [coeffq+16*7]
+    pcmpeqw               m7, m7
+    mov                   r3, tx2q
+    lea                 tx2q, [o(.pass1_2)]
+.pass1:
+    pmulhrsw              m4, m6, m0
+    pmulhrsw              m5, m6, m1
+    pavgw                 m4, m0
+    pcmpeqw               m0, m7
+    pavgw                 m5, m1
+    pcmpeqw               m1, m7
+    pandn                 m0, m4
+    pmulhrsw              m4, m6, m2
+    pandn                 m1, m5
+    pmulhrsw              m5, m6, m3
+    pavgw                 m4, m2
+    pcmpeqw               m2, m7
+    pavgw                 m5, m3
+    pcmpeqw               m3, m7
+    pandn                 m2, m4
+    pandn                 m3, m5
+    jmp m(iadst_4x8_internal).pass1_end
+.pass1_2:
+    mova       [coeffq+16*1], m0
+    mova       [coeffq+16*3], m1
+    mova       [coeffq+16*5], m2
+    mova       [coeffq+16*7], m3
+    mova                  m0, [coeffq+16*0]
+    mova                  m1, [coeffq+16*2]
+    mova                  m2, [coeffq+16*4]
+    mova                  m3, [coeffq+16*6]
+    lea                 tx2q, [o(.pass1_end)]
+    jmp .pass1
+.pass1_end:
+    mova                  m4, [coeffq+16*1]
+    mova                  m5, [coeffq+16*3]
+    mova                  m6, [coeffq+16*5]
+    jmp                   r3
+.pass2:
+    mova                  m7, [o(pw_1697x16)]
+    mova       [coeffq+16*6], m6
+    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+    mova                  m6, [coeffq+16*7]
+    IDTX16                 6, 7, 7
+    mova       [coeffq+16*7], m6
+    mova                  m6, [coeffq+16*6]
+    pmulhrsw              m7, m6, [o(pw_1697x16)]
+    paddsw                m6, m6
+    paddsw                m6, m7
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*4], m4
+    jmp m(iadst_4x16_internal).end2
+
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
+    movd                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]
+    movd                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd
+    mov                 r2d, 2
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
+.dconly:
+    pmulhrsw             m0, m2
+    movd                 m2, [o(pw_2048)]              ;intentionally rip-relative
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    pshuflw              m0, m0, q0000
+    punpcklwd            m0, m0
+    pxor                 m5, m5
+.dconly_loop:
+    mova                 m1, [dstq]
+    mova                 m3, [dstq+strideq]
+    punpckhbw            m2, m1, m5
+    punpcklbw            m1, m5
+    punpckhbw            m4, m3, m5
+    punpcklbw            m3, m5
+    paddw                m2, m0
+    paddw                m1, m0
+    paddw                m4, m0
+    paddw                m3, m0
+    packuswb             m1, m2
+    packuswb             m3, m4
+    mova             [dstq], m1
+    mova     [dstq+strideq], m3
+    lea                dstq, [dstq+strideq*2]
+    dec                 r2d
+    jg .dconly_loop
+    jmp                tx2q
+.end:
+    RET
+%endif
+%endmacro
+
+%macro LOAD_7ROWS 2 ;src, stride
+    mova                 m0, [%1+%2*0]
+    mova                 m1, [%1+%2*1]
+    mova                 m2, [%1+%2*2]
+    mova                 m3, [%1+%2*3]
+    mova                 m4, [%1+%2*4]
+    mova                 m5, [%1+%2*5]
+    mova                 m6, [%1+%2*6]
+%endmacro
+
+%macro SAVE_7ROWS 2 ;src, stride
+    mova          [%1+%2*0], m0
+    mova          [%1+%2*1], m1
+    mova          [%1+%2*2], m2
+    mova          [%1+%2*3], m3
+    mova          [%1+%2*4], m4
+    mova          [%1+%2*5], m5
+    mova          [%1+%2*6], m6
+%endmacro
+
+%macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
+    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
+    punpcklwd            m%1, m%4                     ;packed in1  in15
+    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
+    punpckhwd            m%2, m%3                     ;packed in5  in11
+    mova                 m%7, [o(pd_2048)]
+    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
+    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
+    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
+    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
+    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
+    paddsw               m%1, m%4                      ;low: t8    high: t15
+    psubsw               m%4, m%5, m%2                 ;low: t10   high: t13
+    paddsw               m%5, m%2                      ;low: t11   high: t12
+    mova                 m%2, [o(deint_shuf2)]
+    pshufb               m%6, m%2
+    pshufb               m%4, m%2
+    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
+    ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a
+    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
+    paddsw               m%1, m%5                      ;low: t8a   high: t15a
+    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
+    paddsw               m%6, m%4                      ;low: t9    high: t14
+    pshufb               m%3, m%2
+    pshufb               m%5, m%2
+    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
+    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
+    packssdw             m%2, m%4                      ;low: t11   high: t10a
+    packssdw             m%3, m%5                      ;low: t12   high: t13a
+    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
+    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
+%endmacro
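+
+;scalar sketch of the first rotation stage above (Q12; values and lane
+;placement per the ITX_MUL2X_PACK comments):
+;  t8a  = (in1 * 401 - in15*4076 + 2048) >> 12  t15a = (in1 *4076 + in15* 401 + 2048) >> 12
+;  t9a  = (in9 *3166 - in7 *2598 + 2048) >> 12  t14a = (in9 *2598 + in7 *3166 + 2048) >> 12
+;  t10a = (in5 *1931 - in11*3612 + 2048) >> 12  t13a = (in5 *3612 + in11*1931 + 2048) >> 12
+;  t11a = (in13*3920 - in3 *1189 + 2048) >> 12  t12a = (in13*1189 + in3 *3920 + 2048) >> 12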
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_7ROWS        coeffq, 16
+    call .main
+
+.pass1_end:
+    punpckhwd             m7, m0, m2                 ;packed out1,  out5
+    punpcklwd             m0, m2                     ;packed out0,  out4
+    punpcklwd             m2, m1, m3                 ;packed out3,  out7
+    punpckhwd             m1, m3                     ;packed out2,  out6
+    mova       [coeffq+16*6], m7
+    mova                  m7, [coeffq+16*7]
+    punpckhwd             m3, m4, m6                 ;packed out9,  out13
+    punpcklwd             m4, m6                     ;packed out8,  out12
+    punpcklwd             m6, m5, m7                 ;packed out11, out15
+    punpckhwd             m5, m7                     ;packed out10, out14
+
+.pass1_end2:
+    mova                  m7, [o(pw_16384)]
+    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw              m7, [coeffq+16*6]
+    mova       [coeffq+16*6], m7
+
+.pass1_end3:
+    punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
+    punpcklwd             m3, m6                     ;packed 9, 11, 13, 15 low
+    punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
+    punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low
+    punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1)
+    punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0)
+    punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3)
+    punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2)
+    mova       [coeffq+16*7], m3
+    mova                  m3, [coeffq+16*6]
+    punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high
+    punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low
+    punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high
+    punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low
+    punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1)
+    punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0)
+    punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3)
+    punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2)
+    jmp                 tx2q
+
+.pass2:
+    lea                 tx2q, [o(m(idct_8x4_internal).pass2)]
+
+.pass2_end:
+    mova       [coeffq+16*4], m4
+    mova       [coeffq+16*5], m5
+    mova       [coeffq+16*6], m6
+    lea                   r3, [dstq+8]
+    call                tx2q
+
+    add               coeffq, 16*4
+    mova                  m0, [coeffq+16*0]
+    mova                  m1, [coeffq+16*1]
+    mova                  m2, [coeffq+16*2]
+    mova                  m3, [coeffq+16*3]
+    mov                 dstq, r3
+    jmp                 tx2q
+
+ALIGN function_align
+.main:
+    punpckhqdq            m7, m0, m1                 ;low:in1  high:in3
+    punpcklqdq            m0, m1
+    punpcklqdq            m1, m2, m3
+    punpckhqdq            m3, m2                     ;low:in7  high:in5
+    mova       [coeffq+16*4], m7
+    mova       [coeffq+16*5], m3
+    mova                  m7, [coeffq+16*7]
+    punpcklqdq            m2, m4, m5
+    punpckhqdq            m4, m5                     ;low:in9  high:in11
+    punpcklqdq            m3, m6, m7
+    punpckhqdq            m7, m6                     ;low:in15 high:in13
+    mova       [coeffq+16*6], m4
+    IDCT8_1D_PACKED
+    mova                  m6, [coeffq+16*4]
+    mova                  m4, [coeffq+16*5]
+    mova                  m5, [coeffq+16*6]
+    mova       [coeffq+16*4], m1
+    mova       [coeffq+16*5], m2
+    mova       [coeffq+16*6], m3
+
+    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
+
+    mova                  m1, [coeffq+16*4]
+    psubsw                m3, m0, m7                 ;low:out15 high:out14
+    paddsw                m0, m7                     ;low:out0  high:out1
+    psubsw                m7, m1, m5                 ;low:out12 high:out13
+    paddsw                m1, m5                     ;low:out3  high:out2
+    mova       [coeffq+16*7], m3
+    mova                  m2, [coeffq+16*5]
+    mova                  m3, [coeffq+16*6]
+    psubsw                m5, m2, m4                 ;low:out11 high:out10
+    paddsw                m2, m4                     ;low:out4  high:out5
+    psubsw                m4, m3, m6                 ;low:out8  high:out9
+    paddsw                m3, m6                     ;low:out7  high:out6
+    mova                  m6, m7
+    ret
+
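+; 16x4 ADST: pass 1 computes the full 16-point ADST on packed rows; pass 2
+; reuses the idct_16x4 two-half plumbing with the 8x4 ADST column kernel.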
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_7ROWS        coeffq, 16
+    call .main
+    call .main_pass1_end
+
+    punpckhwd             m6, m7, m0                 ;packed -out11, -out15
+    punpcklwd             m0, m7                     ;packed   out0,   out4
+    punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7
+    punpckhwd             m4, m3                     ;packed   out8,  out12
+    mova                  m1, [coeffq+16*6]
+    punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5
+    punpckhwd             m5, m1                     ;packed  out10,  out14
+    mova                  m1, [coeffq+16*7]
+    mova       [coeffq+16*6], m3
+    mova       [coeffq+16*7], m7
+    punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13
+    punpcklwd             m1, m2                     ;packed   out2,   out6
+
+    mova                  m7, [o(pw_16384)]
+
+.pass1_end:
+    REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
+    pxor                  m2, m2
+    psubw                 m2, m7
+    mova                  m7, [coeffq+16*6]
+    REPX    {pmulhrsw x, m2}, m7, m3, m6
+    pmulhrsw              m2, [coeffq+16*7]
+    mova       [coeffq+16*6], m7
+    jmp   m(idct_16x4_internal).pass1_end3
+
+.pass2:
+    lea                 tx2q, [o(m(iadst_8x4_internal).pass2)]
+    jmp   m(idct_16x4_internal).pass2_end
+
+ALIGN function_align
+.main:
+    mova       [coeffq+16*6], m0
+    pshufd                m0, m1, q1032
+    pshufd                m2, m2, q1032
+    punpckhwd             m1, m6, m0                 ;packed in13,  in2
+    punpcklwd             m0, m6                     ;packed  in3, in12
+    punpckhwd             m7, m5, m2                 ;packed in11,  in4
+    punpcklwd             m2, m5                     ;packed  in5, in10
+    mova                  m6, [o(pd_2048)]
+    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
+    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
+    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
+    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
+    psubsw                m5, m1, m2                 ;low:t10a high:t11a
+    paddsw                m1, m2                     ;low:t2a  high:t3a
+    psubsw                m2, m7, m0                 ;low:t12a high:t13a
+    paddsw                m7, m0                     ;low:t4a  high:t5a
+    punpcklqdq            m0, m5
+    punpckhwd             m0, m5                     ;packed t10a, t11a
+    punpcklqdq            m5, m2
+    punpckhwd             m2, m5                     ;packed t13a, t12a
+    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
+    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
+    mova       [coeffq+16*4], m1
+    mova       [coeffq+16*5], m7
+    mova                  m1, [coeffq+16*6]
+    mova                  m7, [coeffq+16*7]
+    pshufd                m1, m1, q1032
+    pshufd                m3, m3, q1032
+    punpckhwd             m5, m7, m1                 ;packed in15,  in0
+    punpcklwd             m1, m7                     ;packed  in1, in14
+    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
+    punpcklwd             m3, m4                     ;packed  in7,  in8
+    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
+    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
+    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
+    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
+    psubsw                m4, m5, m3                 ;low:t8a   high:t9a
+    paddsw                m5, m3                     ;low:t0a   high:t1a
+    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
+    paddsw                m7, m1                     ;low:t6a   high:t7a
+    punpcklqdq            m1, m4
+    punpckhwd             m1, m4                     ;packed  t8a,  t9a
+    punpcklqdq            m4, m3
+    punpckhwd             m3, m4                     ;packed t15a, t14a
+    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
+    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
+    paddsw                m4, m1, m2                 ;low:t12a  high:t13a
+    psubsw                m1, m2                     ;low:t8a   high:t9a
+    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
+    paddsw                m0, m3                     ;low:t10a  high:t11a
+    punpcklqdq            m3, m1
+    punpckhwd             m3, m1                     ;packed t12a, t13a
+    punpcklqdq            m1, m2
+    punpckhwd             m2, m1                     ;packed t15a, t14a
+    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
+    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
+    psubsw                m1, m3, m2                 ;low:t14a  high:t15a
+    paddsw                m3, m2                     ;low:out2  high:-out13
+    psubsw                m2, m4, m0                 ;low:t10   high:t11
+    paddsw                m0, m4                     ;low:-out1 high:out14
+    mova       [coeffq+16*6], m0
+    mova       [coeffq+16*7], m3
+    mova                  m0, [coeffq+16*4]
+    mova                  m3, [coeffq+16*5]
+    psubsw                m4, m5, m3                 ;low:t4    high:t5
+    paddsw                m5, m3                     ;low:t0    high:t1
+    psubsw                m3, m0, m7                 ;low:t6    high:t7
+    paddsw                m0, m7                     ;low:t2    high:t3
+    punpcklqdq            m7, m4
+    punpckhwd             m7, m4                     ;packed t4, t5
+    punpcklqdq            m4, m3
+    punpckhwd             m3, m4                     ;packed t7, t6
+    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
+    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
+    psubsw                m4, m5, m0                 ;low:t2a   high:t3a
+    paddsw                m0, m5                     ;low:out0  high:-out15
+    psubsw                m5, m7, m3                 ;low:t6    high:t7
+    paddsw                m3, m7                     ;low:-out3 high:out12
+    ret
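+; Pass-1 tail: the final (a+/-b)/sqrt(2) butterflies are done with pmaddwd
+; against pw_2896_2896/pw_2896_m2896 and a 12-bit rounded shift, which is
+; more precise than the pmulhrsw shortcut used after pass 2.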
+ALIGN function_align
+.main_pass1_end:
+    mova                  m7, [o(deint_shuf1)]
+    mova       [coeffq+16*4], m0
+    mova       [coeffq+16*5], m3
+    mova                  m0, [o(pw_2896_m2896)]
+    mova                  m3, [o(pw_2896_2896)]
+    pshufb                m1, m7                     ;t14a t15a
+    pshufb                m2, m7                     ;t10  t11
+    pshufb                m4, m7                     ;t2a  t3a
+    pshufb                m5, m7                     ;t6   t7
+    pmaddwd               m7, m0, m2
+    pmaddwd               m2, m3
+    paddd                 m7, m6
+    paddd                 m2, m6
+    psrad                 m7, 12
+    psrad                 m2, 12
+    packssdw              m2, m7                     ;low:out6  high:-out9
+    pmaddwd               m7, m0, m4
+    pmaddwd               m4, m3
+    paddd                 m7, m6
+    paddd                 m4, m6
+    psrad                 m7, 12
+    psrad                 m4, 12
+    packssdw              m4, m7                     ;low:-out7 high:out8
+    pmaddwd               m7, m3, m5
+    pmaddwd               m5, m0
+    paddd                 m7, m6
+    paddd                 m5, m6
+    psrad                 m7, 12
+    psrad                 m5, 12
+    packssdw              m7, m5                     ;low:out4  high:-out11
+    pmaddwd               m5, m3, m1
+    pmaddwd               m1, m0
+    paddd                 m5, m6
+    paddd                 m1, m6
+    psrad                 m5, 12
+    psrad                 m1, 12
+    packssdw              m5, m1                     ;low:-out5 high:out10
+    mova                  m0, [coeffq+16*4]
+    mova                  m3, [coeffq+16*5]
+    ret
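+; Pass-2 tail: same butterflies, but rounded once through pmulhrsw with
+; pw_2896x8 (x*2896/4096 ~= x/sqrt(2)), trading a little precision for speed.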
+ALIGN function_align
+.main_pass2_end:
+    mova                  m7, [o(pw_2896x8)]
+    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
+    punpcklqdq            m2, m1                     ;low:t10   high:t14a
+    psubsw                m1, m2, m6
+    paddsw                m2, m6
+    punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
+    punpcklqdq            m4, m5                     ;low:t2a   high:t6
+    psubsw                m5, m4, m6
+    paddsw                m4, m6
+    pmulhrsw              m1, m7                     ;low:-out9 high:out10
+    pmulhrsw              m2, m7                     ;low:out6  high:-out5
+    pmulhrsw              m5, m7                     ;low:out8  high:-out11
+    pmulhrsw              m4, m7                     ;low:-out7 high:out4
+    punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11
+    punpcklqdq            m4, m5                     ;low:-out7 high:out8
+    punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10
+    punpcklqdq            m2, m1                     ;low:out6  high:-out9
+    ret
+
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_7ROWS        coeffq, 16
+    call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass1_end
+
+    punpcklwd             m6, m7, m0                 ;packed  out11,  out15
+    punpckhwd             m0, m7                     ;packed  -out0,  -out4
+    punpckhwd             m7, m3, m4                 ;packed   out3,   out7
+    punpcklwd             m4, m3                     ;packed  -out8, -out12
+    mova                  m1, [coeffq+16*6]
+    punpckhwd             m3, m1, m5                 ;packed   out1,   out5
+    punpcklwd             m5, m1                     ;packed -out10, -out14
+    mova                  m1, [coeffq+16*7]
+    mova       [coeffq+16*6], m3
+    mova       [coeffq+16*7], m7
+    punpcklwd             m3, m2, m1                 ;packed   out9,  out13
+    punpckhwd             m1, m2                     ;packed  -out2,  -out6
+
+    mova                  m7, [o(pw_m16384)]
+    jmp   m(iadst_16x4_internal).pass1_end
+
+.pass2:
+    lea                 tx2q, [o(m(iflipadst_8x4_internal).pass2)]
+    jmp   m(idct_16x4_internal).pass2_end
+
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
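+; Identity-16 pass 1 with the x1/2 inter-pass rescale folded into the
+; multiply: x + (x*1697/2048)*(1/2) ~= sqrt(2)*x, i.e. (2*sqrt(2)) * 1/2.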
+cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                  m1, [coeffq+16*6]
+    mova                  m0, [coeffq+16*5]
+    mova                  m2, [coeffq+16*7]
+    mova                  m6, [o(pw_1697x16)]
+    mova                  m7, [o(pw_16384)]
+    pmulhrsw              m4, m6, m1
+    pmulhrsw              m3, m6, m0
+    pmulhrsw              m5, m6, m2
+    pmulhrsw              m4, m7
+    pmulhrsw              m3, m7
+    pmulhrsw              m5, m7
+    paddsw                m1, m4
+    paddsw                m0, m3
+    paddsw                m5, m2
+    mova                  m2, [coeffq+16*2]
+    mova                  m3, [coeffq+16*3]
+    mova                  m4, [coeffq+16*4]
+    mova       [coeffq+16*6], m1
+    mova       [coeffq+16*5], m0
+    mova       [coeffq+16*7], m5
+    pmulhrsw              m0, m6, m2
+    pmulhrsw              m1, m6, m3
+    pmulhrsw              m5, m6, m4
+    pmulhrsw              m0, m7
+    pmulhrsw              m1, m7
+    pmulhrsw              m5, m7
+    paddsw                m2, m0
+    paddsw                m3, m1
+    paddsw                m4, m5
+    mova                  m0, [coeffq+16*0]
+    mova                  m1, [coeffq+16*1]
+    pmulhrsw              m5, m6, m0
+    pmulhrsw              m6, m1
+    pmulhrsw              m5, m7
+    pmulhrsw              m6, m7
+    paddsw                m0, m5
+    paddsw                m1, m6
+    mova                  m6, [coeffq+16*6]
+    mova                  m5, [coeffq+16*5]
+    punpckhwd             m7, m0, m2                 ;packed out1,  out5
+    punpcklwd             m0, m2                     ;packed out0,  out4
+    punpckhwd             m2, m1, m3                 ;packed out3,  out7
+    punpcklwd             m1, m3                     ;packed out2,  out6
+    mova       [coeffq+16*6], m7
+    mova                  m7, [coeffq+16*7]
+    punpckhwd             m3, m4, m6                 ;packed out9,  out13
+    punpcklwd             m4, m6                     ;packed out8,  out12
+    punpckhwd             m6, m5, m7                 ;packed out11, out15
+    punpcklwd             m5, m7                     ;packed out10, out14
+    jmp   m(idct_16x4_internal).pass1_end3
+
+.pass2:
+    lea                 tx2q, [o(m(iidentity_8x4_internal).pass2)]
+    jmp   m(idct_16x4_internal).pass2_end
+
+
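+; Store m0-m7 to eight consecutive rows at %1 with stride %2
+; (counterpart to LOAD_8ROWS).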
+%macro SAVE_8ROWS 2  ;src, stride
+    mova                 [%1+%2*0], m0
+    mova                 [%1+%2*1], m1
+    mova                 [%1+%2*2], m2
+    mova                 [%1+%2*3], m3
+    mova                 [%1+%2*4], m4
+    mova                 [%1+%2*5], m5
+    mova                 [%1+%2*6], m6
+    mova                 [%1+%2*7], m7
+%endmacro
+
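+; 8x16 wrapper: instantiates one entry point per type1_type2 pair. For
+; dct_dct there is a DC-only fast path: the DC coefficient is broadcast
+; and scaled by (2896/4096)^3 * 1/2 * 1/16 (three pmulhrsw with pw_2896x8,
+; each ~1/sqrt(2), plus the pw_16384 and pw_2048 roundings), then the 8x8
+; add loop is reused over r3d=4 row groups.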
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklwd            m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mova                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    psrlw                m2, 3              ; pw_2048
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    mov                 r3d, 4
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)]
+    jmp m(inv_txfm_add_dct_dct_8x8).loop
+.end:
+    RET
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                    r3, [o(m(idct_8x8_internal).pass1)]
+
+.pass1:
+    LOAD_8ROWS    coeffq+16*1, 32, 1
+    mov   [rsp+gprsize+16*11], tx2q
+    lea                  tx2q, [o(m(idct_8x16_internal).pass1_end)]
+    jmp                    r3
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*1, 32
+    LOAD_8ROWS    coeffq+16*0, 32, 1
+    mov                  tx2q, [rsp+gprsize+16*11]
+    jmp                    r3
+
+.pass2:
+    lea                  tx2q, [o(m(idct_8x16_internal).end)]
+
+.pass2_pre:
+    mova       [coeffq+16*2 ], m1
+    mova       [coeffq+16*6 ], m3
+    mova       [coeffq+16*10], m5
+    mova       [coeffq+16*14], m7
+    mova                   m1, m2
+    mova                   m2, m4
+    mova                   m3, m6
+    mova                   m4, [coeffq+16*1 ]
+    mova                   m5, [coeffq+16*5 ]
+    mova                   m6, [coeffq+16*9 ]
+    mova                   m7, [coeffq+16*13]
+
+.pass2_main:
+    call m(idct_8x8_internal).main
+
+    SAVE_7ROWS   rsp+gprsize+16*3, 16
+    mova                   m0, [coeffq+16*2 ]
+    mova                   m1, [coeffq+16*6 ]
+    mova                   m2, [coeffq+16*10]
+    mova                   m3, [coeffq+16*14]
+    mova                   m4, [coeffq+16*3 ]
+    mova                   m5, [coeffq+16*7 ]
+    mova                   m6, [coeffq+16*11]
+    mova                   m7, [coeffq+16*15]
+    call m(idct_16x8_internal).main
+
+    mov                    r3, dstq
+    lea                  dstq, [dstq+strideq*8]
+    jmp  m(idct_8x8_internal).end
+
+.end:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                  dstq, r3
+    jmp  m(idct_8x8_internal).end
+
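+; Shared tail (also used by the adst/flipadst/identity variants via tx2q):
+; clears all 16 coefficient rows once the second pass has been written out.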
+.end1:
+    pxor                   m7, m7
+    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                    r3, [o(m(iadst_8x8_internal).pass1)]
+    jmp  m(idct_8x16_internal).pass1
+
+.pass2:
+    lea                  tx2q, [o(m(iadst_8x16_internal).end)]
+
+.pass2_pre:
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*5], m6
+    mova    [rsp+gprsize+16*6], m7
+    mova                    m0, m2
+    mova                    m1, m3
+    mova                    m2, m4
+    mova                    m3, m5
+
+.pass2_main:
+    mova                    m4, [coeffq+16*1 ]
+    mova                    m5, [coeffq+16*3 ]
+    mova                    m6, [coeffq+16*13]
+    mova                    m7, [coeffq+16*15]
+    mova    [rsp+gprsize+16*3], m4
+    mova    [rsp+gprsize+16*4], m5
+    mova    [rsp+gprsize+16*9], m6
+    mova    [rsp+gprsize+32*5], m7
+    mova                    m4, [coeffq+16*5 ]
+    mova                    m5, [coeffq+16*7 ]
+    mova                    m6, [coeffq+16*9 ]
+    mova                    m7, [coeffq+16*11]
+
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass2_end
+
+    mov                    r3, dstq
+    lea                  dstq, [dstq+strideq*8]
+    jmp m(iadst_8x8_internal).end
+
+.end:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                  dstq, r3
+    jmp  m(iadst_8x8_internal).end
+
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    lea                    r3, [o(m(iflipadst_8x8_internal).pass1)]
+    jmp  m(idct_8x16_internal).pass1
+
+.pass2:
+    lea                   tx2q, [o(m(iflipadst_8x16_internal).end)]
+    lea                     r3, [dstq+strideq*8]
+
+.pass2_pre:
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*5], m6
+    mova    [rsp+gprsize+16*6], m7
+    mova                    m0, m2
+    mova                    m1, m3
+    mova                    m2, m4
+    mova                    m3, m5
+
+.pass2_main:
+    mova                    m4, [coeffq+16*1 ]
+    mova                    m5, [coeffq+16*3 ]
+    mova                    m6, [coeffq+16*13]
+    mova                    m7, [coeffq+16*15]
+    mova    [rsp+gprsize+16*3], m4
+    mova    [rsp+gprsize+16*4], m5
+    mova    [rsp+gprsize+16*9], m6
+    mova    [rsp+gprsize+32*5], m7
+    mova                    m4, [coeffq+16*5 ]
+    mova                    m5, [coeffq+16*7 ]
+    mova                    m6, [coeffq+16*9 ]
+    mova                    m7, [coeffq+16*11]
+
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass2_end
+    jmp  m(iflipadst_8x8_internal).end
+
+.end:
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                   dstq, r3
+    jmp  m(iflipadst_8x8_internal).end
+
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS    coeffq+16*1, 32, 1
+    mov                    r3, tx2q
+    lea                  tx2q, [o(m(iidentity_8x16_internal).pass1_end)]
+    mova   [rsp+gprsize+16*1], m6
+    jmp  m(idct_8x8_internal).pass1_end3
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*1, 32
+    LOAD_8ROWS    coeffq+16*0, 32, 1
+    mov                  tx2q, r3
+    mova   [rsp+gprsize+16*1], m6
+    jmp  m(idct_8x8_internal).pass1_end3
+
+.pass2:
+    lea                  tx2q, [o(m(iidentity_8x16_internal).end1)]
+
+.end:
+    mova   [rsp+gprsize+16*0], m7
+    mova   [rsp+gprsize+16*1], m6
+    mova                   m7, [o(pw_1697x16)]
+    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+    mova                   m6, [rsp+gprsize+16*1]
+    mova   [rsp+gprsize+16*2], m5
+    IDTX16                  6, 5, 7
+    mova                   m5, [rsp+gprsize+16*0]
+    IDTX16                  5, 7, 7
+    mova                   m7, [o(pw_2048)]
+    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw               m7, [rsp+gprsize+16*2]
+    mova   [rsp+gprsize+16*0], m5
+    mova   [rsp+gprsize+16*1], m6
+    mova   [rsp+gprsize+16*2], m7
+    jmp  m(idct_8x8_internal).end3
+
+.end1:
+    LOAD_8ROWS    coeffq+16*1, 32
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    lea                  dstq, [dstq+strideq*2]
+    jmp .end
+
+
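+; 16x8 wrapper: the dct_dct DC-only path scales the DC by 2896/4096 twice
+; and reuses the 16x4 dconly loop over r2d=4 iterations.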
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
+%ifidn %1_%2, dct_dct
+    movd                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]
+    movd                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m1
+    mov                 r2d, 4
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)]
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.end:
+    RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS    coeffq+16*0, 32, 1
+    call m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS    coeffq+16*1, 32, 1
+    call  .main
+    mov                    r3, tx2q
+    lea                  tx2q, [o(m(idct_16x8_internal).pass1_end)]
+    jmp  m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*1, 32
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    mov                  tx2q, r3
+    jmp  m(idct_8x8_internal).pass1_end
+
+.pass2:
+    lea                  tx2q, [o(m(idct_16x8_internal).end)]
+    lea                    r3, [dstq+8]
+    jmp  m(idct_8x8_internal).pass2_main
+
+.end:
+    LOAD_8ROWS    coeffq+16*1, 32
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                  dstq, r3
+    jmp  m(idct_8x8_internal).pass2_main
+
+
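+; Odd half of the 16-point IDCT: consumes the first-pass dct8 outputs
+; already saved on the stack around the idct_8x8 main call, computes
+; t8a..t15a from the odd-index inputs, and butterflies both halves into
+; out0..out15.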
+ALIGN function_align
+.main:
+    mova [rsp+gprsize*2+16*1], m2
+    mova [rsp+gprsize*2+16*2], m6
+    mova [rsp+gprsize*2+32*5], m5
+
+    mova                   m6, [o(pd_2048)]
+    ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a
+    ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a
+    psubsw                 m2, m0, m4                   ;t9
+    paddsw                 m0, m4                       ;t8
+    psubsw                 m4, m7, m3                   ;t14
+    paddsw                 m7, m3                       ;t15
+    ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a
+    mova                   m3, [rsp+gprsize*2+16*1]
+    mova                   m5, [rsp+gprsize*2+32*5]
+    mova [rsp+gprsize*2+16*1], m2
+    mova [rsp+gprsize*2+32*5], m4
+    mova                   m2, [rsp+gprsize*2+16*2]
+    mova [rsp+gprsize*2+16*2], m7
+    ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a
+    ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a
+    psubsw                 m4, m2, m3                   ;t10
+    paddsw                 m2, m3                       ;t11
+    psubsw                 m3, m1, m5                   ;t13
+    paddsw                 m1, m5                       ;t12
+    ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
+    mova                   m7, [rsp+gprsize*2+32*5]
+    psubsw                 m6, m0, m2                   ;t11a
+    paddsw                 m0, m2                       ;t8a
+    paddsw                 m2, m7, m3                   ;t9
+    psubsw                 m7, m3                       ;t10
+    mova                   m5, [rsp+gprsize*2+16*0]
+    psubsw                 m3, m5, m0                   ;out8
+    paddsw                 m0, m5                       ;out7
+    mova [rsp+gprsize*2+32*5], m0
+    mova                   m5, [rsp+gprsize*2+16*9]
+    psubsw                 m0, m5, m2                   ;out9
+    paddsw                 m2, m5                       ;out6
+    mova [rsp+gprsize*2+16*0], m0
+    mova [rsp+gprsize*2+16*9], m2
+    mova                   m0, [rsp+gprsize*2+16*1]
+    mova                   m2, [rsp+gprsize*2+16*2]
+    mova [rsp+gprsize*2+16*1], m3
+    psubsw                 m5, m0, m4                   ;t13
+    paddsw                 m0, m4                       ;t14
+    mova                   m3, [o(pd_2048)]
+    psubsw                 m4, m2, m1                   ;t12a
+    paddsw                 m1, m2                       ;t15a
+    mova [rsp+gprsize*2+16*2], m1
+    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
+    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
+    mova                   m3, [rsp+gprsize*2+16*8]
+    psubsw                 m2, m3, m5                   ;out10
+    paddsw                 m3, m5                       ;out5
+    mova                   m5, [rsp+gprsize*2+16*7]
+    mova [rsp+gprsize*2+16*8], m3
+    psubsw                 m3, m5, m4                   ;out11
+    paddsw                 m5, m4                       ;out4
+    mova                   m4, [rsp+gprsize*2+16*6]
+    mova [rsp+gprsize*2+16*7], m5
+    paddsw                 m5, m4, m6                   ;out3
+    psubsw                 m4, m6                       ;out12
+    mova                   m6, [rsp+gprsize*2+16*5]
+    mova [rsp+gprsize*2+16*6], m5
+    psubsw                 m5, m6, m7                   ;out13
+    paddsw                 m6, m7                       ;out2
+    mova                   m7, [rsp+gprsize*2+16*4]
+    mova [rsp+gprsize*2+16*5], m6
+    psubsw                 m6, m7, m0                   ;out14
+    paddsw                 m7, m0                       ;out1
+    mova                   m1, [rsp+gprsize*2+16*2]
+    mova                   m0, [rsp+gprsize*2+16*3]
+    mova [rsp+gprsize*2+16*4], m7
+    psubsw                 m7, m0, m1                   ;out15
+    paddsw                 m0, m1                       ;out0
+    mova [rsp+gprsize*2+16*3], m0
+    mova                   m1, [rsp+gprsize*2+16*0]
+    mova                   m0, [rsp+gprsize*2+16*1]
+    mova [rsp+gprsize*2+16*0], m7
+    ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                    m7, [o(pw_2896x8)]
+    pmulhrsw                m0, m7, [coeffq+16*0 ]
+    pmulhrsw                m1, m7, [coeffq+16*1 ]
+    pmulhrsw                m2, m7, [coeffq+16*14]
+    pmulhrsw                m3, m7, [coeffq+16*15]
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*9], m2
+    mova    [rsp+gprsize+32*5], m3
+    pmulhrsw                m0, m7, [coeffq+16*6 ]
+    pmulhrsw                m1, m7, [coeffq+16*7 ]
+    pmulhrsw                m2, m7, [coeffq+16*8 ]
+    pmulhrsw                m3, m7, [coeffq+16*9 ]
+    mova    [rsp+gprsize+16*3], m2
+    mova    [rsp+gprsize+16*4], m3
+    mova    [rsp+gprsize+16*5], m0
+    mova    [rsp+gprsize+16*6], m1
+    pmulhrsw                m0, m7, [coeffq+16*2 ]
+    pmulhrsw                m1, m7, [coeffq+16*3 ]
+    pmulhrsw                m2, m7, [coeffq+16*4 ]
+    pmulhrsw                m3, m7, [coeffq+16*5 ]
+    pmulhrsw                m4, m7, [coeffq+16*10]
+    pmulhrsw                m5, m7, [coeffq+16*11]
+    pmulhrsw                m6, m7, [coeffq+16*12]
+    pmulhrsw                m7,     [coeffq+16*13]
+
+    call .main
+    call .main_pass1_end
+    mov                    r3, tx2q
+    lea                  tx2q, [o(m(iadst_16x8_internal).pass1_end)]
+    jmp m(iadst_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*1, 32
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    mov                  tx2q, r3
+    jmp m(iadst_8x8_internal).pass1_end
+
+.pass2:
+    lea                  tx2q, [o(m(iadst_16x8_internal).end)]
+    lea                    r3, [dstq+8]
+    jmp m(iadst_8x8_internal).pass2_main
+
+.end:
+    LOAD_8ROWS    coeffq+16*1, 32
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                  dstq, r3
+    jmp m(iadst_8x8_internal).pass2_main
+
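+; 16-point ADST ladder. The sixteen inputs are staged in the scratch rows
+; by the caller; the constant pairs fed to ITX_MULSUB_2W are 12-bit
+; fixed-point (sin, cos) rotation coefficients. The two *_end tails below
+; mirror the 16x4 ADST split: pmaddwd rounding after pass 1, the cheaper
+; pmulhrsw after pass 2.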
+ALIGN function_align
+.main:
+    mova  [rsp+gprsize*2+16*0], m1
+    mova  [rsp+gprsize*2+16*1], m2
+    mova  [rsp+gprsize*2+16*2], m6
+
+    mova                    m6, [o(pd_2048)]
+    ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2
+    ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10
+    psubsw                  m1, m0, m4                   ;t10a
+    paddsw                  m0, m4                       ;t2a
+    psubsw                  m4, m7, m3                   ;t11a
+    paddsw                  m3, m7                       ;t3a
+    ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10
+    mova                    m2, [rsp+gprsize*2+16*0]     ;in3
+    mova                    m7, [rsp+gprsize*2+16*1]     ;in4
+    mova  [rsp+gprsize*2+16*0], m1                       ;t11
+    mova  [rsp+gprsize*2+16*1], m4                       ;t10
+    mova                    m1, [rsp+gprsize*2+16*2]     ;in12
+    mova  [rsp+gprsize*2+16*2], m0                       ;t2a
+    ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4
+    ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12
+    psubsw                  m0, m7, m1                   ;t12a
+    paddsw                  m1, m7                       ;t4a
+    psubsw                  m4, m5, m2                   ;t13a
+    paddsw                  m5, m2                       ;t5a
+    ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13
+    mova                    m2, [rsp+gprsize*2+16*8]     ;in1
+    mova                    m7, [rsp+gprsize*2+16*9]     ;in14
+    mova  [rsp+gprsize*2+16*8], m4                       ;t12
+    mova  [rsp+gprsize*2+16*9], m0                       ;t13
+    mova                    m4, [rsp+gprsize*2+16*4]     ;in9
+    mova                    m0, [rsp+gprsize*2+16*5]     ;in6
+    mova  [rsp+gprsize*2+16*4], m1                       ;t4a
+    mova  [rsp+gprsize*2+16*5], m5                       ;t5a
+    ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14
+    ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6
+    psubsw                  m1, m0, m7                   ;t14a
+    paddsw                  m0, m7                       ;t6a
+    psubsw                  m5, m4, m2                   ;t15a
+    paddsw                  m4, m2                       ;t7a
+    ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15
+    mova                    m2, [rsp+gprsize*2+16*2]     ;t2a
+    mova  [rsp+gprsize*2+16*2], m5                       ;t14
+    psubsw                  m7, m2, m0                   ;t6
+    paddsw                  m2, m0                       ;t2
+    psubsw                  m0, m3, m4                   ;t7
+    paddsw                  m3, m4                       ;t3
+    ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a
+    mova                    m4, [rsp+gprsize*2+16*7]     ;in0
+    mova                    m5, [rsp+gprsize*2+32*5]     ;in15
+    mova  [rsp+gprsize*2+16*7], m3                       ;t3
+    mova  [rsp+gprsize*2+32*5], m1                       ;t15
+    mova                    m1, [rsp+gprsize*2+16*6]     ;in7
+    mova                    m3, [rsp+gprsize*2+16*3]     ;in8
+    mova  [rsp+gprsize*2+16*6], m7                       ;t7a
+    mova  [rsp+gprsize*2+16*3], m0                       ;t6a
+    ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0
+    ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8
+    psubsw                  m0, m4, m3                   ;t8a
+    paddsw                  m4, m3                       ;t0a
+    psubsw                  m3, m5, m1                   ;t9a
+    paddsw                  m5, m1                       ;t1a
+    ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8
+    mova                    m1, [rsp+gprsize*2+16*4]     ;t4a
+    mova                    m7, [rsp+gprsize*2+16*5]     ;t5a
+    mova  [rsp+gprsize*2+16*4], m3                       ;t8
+    mova  [rsp+gprsize*2+16*5], m0                       ;t9
+    psubsw                  m0, m4, m1                   ;t4
+    paddsw                  m4, m1                       ;t0
+    psubsw                  m3, m5, m7                   ;t5
+    paddsw                  m5, m7                       ;t1
+    ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a
+    mova                    m7, [rsp+gprsize*2+16*3]     ;t6a
+    psubsw                  m1, m4, m2                   ;t2a
+    paddsw                  m4, m2                       ;out0
+    mova  [rsp+gprsize*2+16*3], m4                       ;out0
+    mova                    m4, [rsp+gprsize*2+16*6]     ;t7a
+    psubsw                  m2, m3, m7                   ;t6
+    paddsw                  m3, m7                       ;-out3
+    mova  [rsp+gprsize*2+16*6], m3                       ;-out3
+    psubsw                  m3, m0, m4                   ;t7
+    paddsw                  m0, m4                       ;out12
+    mova [rsp+gprsize*2+16*12], m3                       ;t7
+    mova                    m3, [rsp+gprsize*2+16*7]     ;t3
+    mova [rsp+gprsize*2+16* 7], m2                       ;out4
+    psubsw                  m2, m5, m3                   ;t3a
+    paddsw                  m5, m3                       ;-out15
+    mova [rsp+gprsize*2+16*11], m2                       ;t3a
+    mova                    m2, [rsp+gprsize*2+32*5]     ;t15
+    mova [rsp+gprsize*2+16*10], m1                       ;-out7
+    mova                    m1, [rsp+gprsize*2+16*0]     ;t11
+    mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
+    mova                    m3, [rsp+gprsize*2+16*1]     ;t10
+    mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
+    mova                    m4, [rsp+gprsize*2+16*2]     ;t14
+    mova [rsp+gprsize*2+16*2 ], m0                       ;out12
+    psubsw                  m0, m3, m4                   ;t14a
+    paddsw                  m3, m4                       ;t10a
+    psubsw                  m5, m1, m2                   ;t15a
+    paddsw                  m1, m2                       ;t11a
+    ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15
+    mova                    m2, [rsp+gprsize*2+16*4]     ;t8
+    mova                    m4, [rsp+gprsize*2+16*5]     ;t9
+    mova  [rsp+gprsize*2+16*4], m3                       ;t10a
+    mova  [rsp+gprsize*2+16*5], m1                       ;t11a
+    mova                    m3, [rsp+gprsize*2+16*8]     ;t12
+    mova                    m1, [rsp+gprsize*2+16*9]     ;t13
+    mova  [rsp+gprsize*2+16*8], m5                       ;t14
+    mova  [rsp+gprsize*2+16*9], m0                       ;t15
+    psubsw                  m5, m2, m3                   ;t12a
+    paddsw                  m2, m3                       ;t8a
+    psubsw                  m0, m4, m1                   ;t13a
+    paddsw                  m4, m1                       ;t9a
+    ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12
+    mova                    m6, [rsp+gprsize*2+16*4]     ;t10a
+    mova                    m1, [rsp+gprsize*2+16*5]     ;t11a
+    psubsw                  m3, m2, m6                   ;t10
+    paddsw                  m2, m6                       ;-out1
+    paddsw                  m6, m4, m1                   ;out14
+    psubsw                  m4, m1                       ;t11
+    mova [rsp+gprsize*2+16*14], m4                       ;t11
+    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
+    mova                    m4, [rsp+gprsize*2+16*8]     ;t14
+    mova                    m2, [rsp+gprsize*2+16*9]     ;t15
+    mova [rsp+gprsize*2+16* 9], m3                       ;out6
+    psubsw                  m3, m0, m4                   ;t14a
+    paddsw                  m0, m4                       ;out2
+    psubsw                  m4, m5, m2                   ;t15a
+    paddsw                  m5, m2                       ;-out13
+    mova [rsp+gprsize*2+16* 5], m0                       ;out2
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova                    m0, [rsp+gprsize*2+16*14]
+    mova [rsp+gprsize*2+16*14], m5
+    mova [rsp+gprsize*2+16*15], m6
+    mova                    m5, [o(pw_2896_2896)]
+    mova                    m6, [o(pw_2896_m2896)]
+    mova                    m7, [o(pd_2048)]
+    punpcklwd               m2, m3, m4
+    punpckhwd               m3, m4
+    pmaddwd                 m4, m5, m2
+    pmaddwd                 m2, m6
+    pmaddwd                 m1, m5, m3
+    pmaddwd                 m3, m6
+    REPX         {paddd x, m7}, m4, m2, m1, m3
+    REPX         {psrad x, 12}, m4, m1, m2, m3
+    packssdw                m4, m1                       ;-out5
+    packssdw                m2, m3                       ;out10
+    mova [rsp+gprsize*2+16* 8], m4
+    mova                    m3, [rsp+gprsize*2+16* 9]
+    punpcklwd               m1, m3, m0
+    punpckhwd               m3, m0
+    pmaddwd                 m0, m5, m1
+    pmaddwd                 m1, m6
+    pmaddwd                 m4, m5, m3
+    pmaddwd                 m3, m6
+    REPX         {paddd x, m7}, m0, m1, m4, m3
+    REPX         {psrad x, 12}, m0, m4, m1, m3
+    packssdw                m0, m4                       ;out6
+    packssdw                m1, m3                       ;-out9
+    mova [rsp+gprsize*2+16* 9], m0
+    mova                    m0, [rsp+gprsize*2+16* 7]
+    mova                    m4, [rsp+gprsize*2+16*12]
+    punpcklwd               m3, m0, m4
+    punpckhwd               m0, m4
+    pmaddwd                 m4, m5, m3
+    pmaddwd                 m3, m6
+    pmaddwd                 m5, m0
+    pmaddwd                 m0, m6
+    REPX         {paddd x, m7}, m4, m3, m5, m0
+    REPX         {psrad x, 12}, m4, m5, m3, m0
+    packssdw                m4, m5                       ;out4
+    packssdw                m3, m0                       ;-out11
+    mova [rsp+gprsize*2+16* 7], m4
+    mova                    m4, [rsp+gprsize*2+16*10]
+    mova                    m5, [rsp+gprsize*2+16*11]
+    punpcklwd               m0, m4, m5
+    punpckhwd               m4, m5
+    pmaddwd                 m5, m0, [o(pw_2896_2896)]
+    pmaddwd                 m0, m6
+    pmaddwd                 m6, m4
+    pmaddwd                 m4, [o(pw_2896_2896)]
+    REPX         {paddd x, m7}, m5, m0, m6, m4
+    REPX         {psrad x, 12}, m0, m6, m5, m4
+    packssdw                m0, m6                       ;out8
+    packssdw                m5, m4                       ;-out7
+    mova [rsp+gprsize*2+16*10], m5
+    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
+    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
+    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
+    ret
+ALIGN function_align
+.main_pass2_end:
+    mova                    m7, [o(pw_2896x8)]
+    mova                    m1, [rsp+gprsize*2+16* 9]
+    mova                    m2, [rsp+gprsize*2+16*14]
+    paddsw                  m0, m1, m2
+    psubsw                  m1, m2
+    pmulhrsw                m0, m7                       ;out6
+    pmulhrsw                m1, m7                       ;-out9
+    mova [rsp+gprsize*2+16* 9], m0
+    psubsw                  m2, m3, m4
+    paddsw                  m3, m4
+    pmulhrsw                m2, m7                       ;out10
+    pmulhrsw                m3, m7                       ;-out5
+    mova [rsp+gprsize*2+16* 8], m3
+    mova                    m3, [rsp+gprsize*2+16* 7]
+    mova                    m4, [rsp+gprsize*2+16*12]
+    paddsw                  m0, m3, m4
+    psubsw                  m3, m4
+    pmulhrsw                m0, m7                       ;out4
+    pmulhrsw                m3, m7                       ;-out11
+    mova [rsp+gprsize*2+16* 7], m0
+    mova                    m0, [rsp+gprsize*2+16*10]
+    paddsw                  m4, m0, [rsp+gprsize*2+16*11]
+    psubsw                  m0, [rsp+gprsize*2+16*11]
+    pmulhrsw                m4, m7                       ;-out7
+    pmulhrsw                m0, m7                       ;out8
+    mova [rsp+gprsize*2+16*10], m4
+    mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
+    ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                    m7, [o(pw_2896x8)]
+    pmulhrsw                m0, m7, [coeffq+16*0 ]
+    pmulhrsw                m1, m7, [coeffq+16*1 ]
+    pmulhrsw                m2, m7, [coeffq+16*14]
+    pmulhrsw                m3, m7, [coeffq+16*15]
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*9], m2
+    mova    [rsp+gprsize+32*5], m3
+    pmulhrsw                m0, m7, [coeffq+16*6 ]
+    pmulhrsw                m1, m7, [coeffq+16*7 ]
+    pmulhrsw                m2, m7, [coeffq+16*8 ]
+    pmulhrsw                m3, m7, [coeffq+16*9 ]
+    mova    [rsp+gprsize+16*3], m2
+    mova    [rsp+gprsize+16*4], m3
+    mova    [rsp+gprsize+16*5], m0
+    mova    [rsp+gprsize+16*6], m1
+    pmulhrsw                m0, m7, [coeffq+16*2 ]
+    pmulhrsw                m1, m7, [coeffq+16*3 ]
+    pmulhrsw                m2, m7, [coeffq+16*4 ]
+    pmulhrsw                m3, m7, [coeffq+16*5 ]
+    pmulhrsw                m4, m7, [coeffq+16*10]
+    pmulhrsw                m5, m7, [coeffq+16*11]
+    pmulhrsw                m6, m7, [coeffq+16*12]
+    pmulhrsw                m7,     [coeffq+16*13]
+
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS     coeffq+16*0, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mov                     r3, tx2q
+    lea                   tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
+    jmp m(iflipadst_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS     coeffq+16*1, 32
+    LOAD_8ROWS     coeffq+16*0, 32
+    mova    [rsp+gprsize+16*0], m7
+    mov                   tx2q, r3
+    jmp m(iflipadst_8x8_internal).pass1_end
+
+.pass2:
+    lea                   tx2q, [o(m(iflipadst_16x8_internal).end)]
+    lea                     r3, [dstq+8]
+    jmp m(iflipadst_8x8_internal).pass2_main
+
+.end:
+    LOAD_8ROWS     coeffq+16*1, 32
+    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                   dstq, r3
+    jmp m(iflipadst_8x8_internal).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
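+; 16x8 identity: the rectangular transform pre-scales by 2896/4096
+; (~1/sqrt(2), pw_2896x8), then applies the identity-16 multiply with the
+; x1/2 inter-pass rescale folded in, as in the 16x4 version.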
+cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    add                coeffq, 16*16
+    mova                   m4, [coeffq-16*7]
+    mova                   m5, [coeffq-16*5]
+    mova                   m6, [coeffq-16*3]
+    mova                   m7, [coeffq-16*1]
+    mov                    r3, tx2q
+    lea                  tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
+
+.pass1:
+    mova                   m0, [o(pw_2896x8)]
+    mova                   m2, [o(pw_1697x16)]
+    mova                   m3, [o(pw_16384)]
+    sub                coeffq, 8*16
+    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7
+    pmulhrsw               m1, m2, m4
+    pmulhrsw               m1, m3
+    paddsw                 m1, m4 ; 1
+    pmulhrsw               m4, m2, m5
+    pmulhrsw               m4, m3
+    paddsw                 m4, m5 ; 3
+    pmulhrsw               m5, m2, m6
+    pmulhrsw               m5, m3
+    paddsw                 m5, m6 ; 5
+    pmulhrsw               m6, m2, m7
+    pmulhrsw               m6, m3
+    paddsw                 m7, m6 ; 7
+    pmulhrsw               m6, m0, [coeffq+16*6]
+    mova   [rsp+gprsize+16*0], m4
+    pmulhrsw               m4, m2, m6
+    pmulhrsw               m4, m3
+    paddsw                 m6, m4 ; 6
+    pmulhrsw               m4, m0, [coeffq+16*4]
+    mova   [rsp+gprsize+16*1], m6
+    pmulhrsw               m6, m2, m4
+    pmulhrsw               m6, m3
+    paddsw                 m4, m6 ; 4
+    pmulhrsw               m6, m0, [coeffq+16*2]
+    pmulhrsw               m0,     [coeffq+16*0]
+    pmulhrsw               m2, m6
+    pmulhrsw               m2, m3
+    paddsw                 m2, m6 ; 2
+    pmulhrsw               m6, m0, [o(pw_1697x16)]
+    pmulhrsw               m6, m3
+    mova                   m3, [rsp+gprsize+16*0]
+    paddsw                 m0, m6
+    jmp   m(idct_8x8_internal).pass1_end3
+
+.pass1_end:
+    mova        [coeffq+16*1], m4
+    mova        [coeffq+16*3], m5
+    mova        [coeffq+16*5], m6
+    mova        [coeffq+16*7], m7
+    mova                   m4, [coeffq-16*7]
+    mova                   m5, [coeffq-16*5]
+    mova                   m6, [coeffq-16*3]
+    mova                   m7, [coeffq-16*1]
+    mova        [coeffq-16*7], m0
+    mova        [coeffq-16*5], m1
+    mova        [coeffq-16*3], m2
+    mova        [coeffq-16*1], m3
+    mov                  tx2q, r3
+    jmp .pass1
+
+.pass2:
+    lea                  tx2q, [o(m(iidentity_16x8_internal).end)]
+    lea                    r3, [dstq+8]
+    jmp  m(iidentity_8x8_internal).end
+
+.end:
+    LOAD_8ROWS    coeffq+16*1, 32
+    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
+    mov                  dstq, r3
+    jmp  m(iidentity_8x8_internal).end
+
+
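+; 16x16 wrapper: the dct_dct DC-only path loads pw_8192 (x1/4 rounding)
+; and reuses the 16x4 dconly loop over r2d=8 iterations.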
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+    movd                   m1, [o(pw_2896x8)]
+    pmulhrsw               m0, m1, [coeffq]
+    movd                   m2, [o(pw_8192)]
+    mov              [coeffq], eobd
+    mov                   r2d, 8
+    lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)]
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.end:
+    RET
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS     coeffq+16*1, 64
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*3, 64
+    call m(idct_16x8_internal).main
+    mov                     r3, tx2q
+    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end)]
+    mova                    m7, [o(pw_8192)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*17, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end1)]
+    mova                    m7, [o(pw_8192)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+16*1, 32
+    LOAD_8ROWS     coeffq+16*0, 64
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*2, 64
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end2)]
+    mova                    m7, [o(pw_8192)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*16, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mov                   tx2q, r3
+    mova                    m7, [o(pw_8192)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass2:
+    lea                   tx2q, [o(m(idct_16x16_internal).end)]
+    jmp  m(idct_8x16_internal).pass2_pre
+
+.end:
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x16_internal).end1)]
+    mov                   dstq, r3
+    lea                     r3, [dstq+8]
+    jmp   m(idct_8x8_internal).end
+
+.end1:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+
+    add                 coeffq, 32*8
+    mov                   dstq, r3
+
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*4 ]
+    mova                    m2, [coeffq+16*8 ]
+    mova                    m3, [coeffq+16*12]
+    mova                    m4, [coeffq+16*1 ]
+    mova                    m5, [coeffq+16*5 ]
+    mova                    m6, [coeffq+16*9 ]
+    mova                    m7, [coeffq+16*13]
+    lea                   tx2q, [o(m(idct_8x16_internal).end)]
+    jmp  m(idct_8x16_internal).pass2_main
+
+
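+; Stage all sixteen odd-numbered (resp. even-numbered) 8-coefficient blocks
+; of the 16x16 buffer for one ADST pass, parking eight of them in the
+; scratch rows expected by iadst_16x8's main.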
+%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
+    mova                    m0, [coeffq+16*1 ]
+    mova                    m1, [coeffq+16*3 ]
+    mova                    m2, [coeffq+16*29]
+    mova                    m3, [coeffq+16*31]
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*9], m2
+    mova    [rsp+gprsize+32*5], m3
+    mova                    m0, [coeffq+16*13]
+    mova                    m1, [coeffq+16*15]
+    mova                    m2, [coeffq+16*17]
+    mova                    m3, [coeffq+16*19]
+    mova    [rsp+gprsize+16*3], m2
+    mova    [rsp+gprsize+16*4], m3
+    mova    [rsp+gprsize+16*5], m0
+    mova    [rsp+gprsize+16*6], m1
+    mova                    m0, [coeffq+16*5 ]
+    mova                    m1, [coeffq+16*7 ]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*11]
+    mova                    m4, [coeffq+16*21]
+    mova                    m5, [coeffq+16*23]
+    mova                    m6, [coeffq+16*25]
+    mova                    m7, [coeffq+16*27]
+%endmacro
+
+%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*2 ]
+    mova                    m2, [coeffq+16*28]
+    mova                    m3, [coeffq+16*30]
+    mova    [rsp+gprsize+16*7], m0
+    mova    [rsp+gprsize+16*8], m1
+    mova    [rsp+gprsize+16*9], m2
+    mova    [rsp+gprsize+32*5], m3
+    mova                    m0, [coeffq+16*12]
+    mova                    m1, [coeffq+16*14]
+    mova                    m2, [coeffq+16*16]
+    mova                    m3, [coeffq+16*18]
+    mova    [rsp+gprsize+16*3], m2
+    mova    [rsp+gprsize+16*4], m3
+    mova    [rsp+gprsize+16*5], m0
+    mova    [rsp+gprsize+16*6], m1
+    mova                    m0, [coeffq+16*4 ]
+    mova                    m1, [coeffq+16*6 ]
+    mova                    m2, [coeffq+16*8 ]
+    mova                    m3, [coeffq+16*10]
+    mova                    m4, [coeffq+16*20]
+    mova                    m5, [coeffq+16*22]
+    mova                    m6, [coeffq+16*24]
+    mova                    m7, [coeffq+16*26]
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_16X16_ADST_LOAD_ODD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+
+    mov                     r3, tx2q
+    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end)]
+    mova                    m7, [o(pw_8192)]
+    jmp  m(iadst_8x8_internal).pass1_end1
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*17, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end1)]
+    mova                    m7, [o(pw_8192)]
+    jmp  m(iadst_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+16*1, 32
+    ITX_16X16_ADST_LOAD_EVEN_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+
+    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
+    mova                    m7, [o(pw_8192)]
+    jmp  m(iadst_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*16, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mov                   tx2q, r3
+    mova                    m7, [o(pw_8192)]
+    jmp  m(iadst_8x8_internal).pass1_end1
+
+.pass2:
+    lea                   tx2q, [o(m(iadst_16x16_internal).end)]
+    jmp m(iadst_8x16_internal).pass2_pre
+
+.end:
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(iadst_16x16_internal).end1)]
+    mov                   dstq, r3
+    lea                     r3, [dstq+8]
+    jmp  m(iadst_8x8_internal).end
+
+.end1:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+
+    add                 coeffq, 32*8
+    mov                   dstq, r3
+
+    mova                    m4, [coeffq+16*0 ]
+    mova                    m5, [coeffq+16*2 ]
+    mova                    m0, [coeffq+16*4 ]
+    mova                    m1, [coeffq+16*6 ]
+    mova                    m2, [coeffq+16*8 ]
+    mova                    m3, [coeffq+16*10]
+    mova                    m6, [coeffq+16*12]
+    mova                    m7, [coeffq+16*14]
+    mova    [rsp+gprsize+16*7], m4
+    mova    [rsp+gprsize+16*8], m5
+    mova    [rsp+gprsize+16*5], m6
+    mova    [rsp+gprsize+16*6], m7
+    lea                   tx2q, [o(m(iadst_8x16_internal).end)]
+    jmp m(iadst_8x16_internal).pass2_main
+
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_16X16_ADST_LOAD_ODD_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+
+    mov                     r3, tx2q
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
+    mova                    m7, [o(pw_m8192)]
+    jmp  m(iflipadst_8x8_internal).pass1_end1
+
+.pass1_end:
+    SAVE_8ROWS     coeffq+16*1, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)]
+    mova                    m7, [o(pw_m8192)]
+    jmp  m(iflipadst_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS    coeffq+16*17, 32
+    ITX_16X16_ADST_LOAD_EVEN_COEFS
+    call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
+
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS     coeffq+16*0, 32
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)]
+    mova                    m7, [o(pw_m8192)]
+    jmp  m(iflipadst_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*16, 32
+    LOAD_8ROWS    coeffq+16* 0, 32
+    mova    [rsp+gprsize+16*0], m7
+    mov                   tx2q, r3
+    mova                    m7, [o(pw_m8192)]
+    jmp m(iflipadst_8x8_internal).pass1_end1
+
+.pass2:
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).end)]
+    lea                     r3, [dstq+8]
+    jmp m(iflipadst_8x16_internal).pass2_pre
+
+.end:
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).end1)]
+    lea                   dstq, [dstq+strideq*2]
+    jmp  m(iflipadst_8x8_internal).end
+
+.end1:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+
+    add                 coeffq, 32*8
+
+    mova                    m4, [coeffq+16*0 ]
+    mova                    m5, [coeffq+16*2 ]
+    mova                    m0, [coeffq+16*4 ]
+    mova                    m1, [coeffq+16*6 ]
+    mova                    m2, [coeffq+16*8 ]
+    mova                    m3, [coeffq+16*10]
+    mova                    m6, [coeffq+16*12]
+    mova                    m7, [coeffq+16*14]
+    mova    [rsp+gprsize+16*7], m4
+    mova    [rsp+gprsize+16*8], m5
+    mova    [rsp+gprsize+16*5], m6
+    mova    [rsp+gprsize+16*6], m7
+
+    lea                   tx2q, [o(m(iflipadst_16x16_internal).end2)]
+    mov                   dstq, r3
+    jmp m(iflipadst_8x16_internal).pass2_main
+
+.end2:
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
+    lea                   dstq, [dstq+strideq*2]
+    jmp  m(iflipadst_8x8_internal).end
+
+
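+; IDTX16B: identity-16 with the pass-1 rounding folded in. pmulhrsw by
+; pw_1697x16 gives x*1697/2048, psraw halves that, and pavgw averages it with
+; the source, which is roughly x*(1+1697/4096)/2 ~= x/sqrt(2): the 2*sqrt(2)
+; identity gain times the usual 1/4 (pw_8192) scale, computed without
+; overflowing 16-bit intermediates.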
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+    pmulhrsw            m%2, m%3, m%1
+    psraw               m%2, 1
+    pavgw               m%1, m%2
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    add                 coeffq, 16*17
+    mov                     r3, tx2q
+    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
+
+.pass1:
+    mova                    m6, [o(pw_1697x16)]
+    mova                    m7, [coeffq+32*6]
+    mova                    m0, [coeffq+32*0]
+    mova                    m1, [coeffq+32*1]
+    mova                    m2, [coeffq+32*2]
+    mova                    m3, [coeffq+32*3]
+    mova                    m4, [coeffq+32*4]
+    REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
+    mova                    m5, [coeffq+32*5]
+    mova    [rsp+gprsize+16*1], m7
+    IDTX16B                  5, 7, 6
+    mova                    m7, [coeffq+32*7]
+    IDTX16B                  7, 6, 6
+    jmp   m(idct_8x8_internal).pass1_end3
+
+.pass1_end:
+    SAVE_8ROWS          coeffq, 32
+    sub                 coeffq, 16
+    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
+    jmp .pass1
+
+.pass1_end1:
+    SAVE_8ROWS          coeffq, 32
+    sub                 coeffq, 15*16
+    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
+    jmp .pass1
+
+.pass1_end2:
+    SAVE_8ROWS          coeffq, 32
+    sub                 coeffq, 16
+    mov                   tx2q, r3
+    jmp .pass1
+
+.pass2:
+    lea                     r3, [dstq+8]
+    lea                   tx2q, [o(m(iidentity_16x16_internal).end1)]
+
+.end:
+    mova    [rsp+gprsize+16*0], m7
+    mova    [rsp+gprsize+16*1], m4
+    mova                    m7, [o(pw_1697x16)]
+    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
+    mova                    m4, [o(pw_2048)]
+    pmulhrsw                m5, m4
+    pmulhrsw                m6, m4
+    mova    [rsp+gprsize+16*2], m5
+    mova                    m5, [rsp+gprsize+16*1]
+    mova    [rsp+gprsize+16*1], m6
+    IDTX16                   5, 6, 7
+    mova                    m6, [rsp+gprsize+16*0]
+    IDTX16                   6, 7, 7
+    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+    pmulhrsw                m4, m5
+    mova    [rsp+gprsize+16*0], m6
+    jmp   m(idct_8x8_internal).end3
+
+.end1:
+    LOAD_8ROWS     coeffq+16*1, 32
+    lea                   tx2q, [o(m(iidentity_16x16_internal).end2)]
+    lea                   dstq, [dstq+strideq*2]
+    jmp .end
+
+.end2:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+
+    add                 coeffq, 32*8
+    LOAD_8ROWS          coeffq, 32
+    lea                   tx2q, [o(m(iidentity_16x16_internal).end3)]
+    mov                   dstq, r3
+    jmp .end
+
+.end3:
+    LOAD_8ROWS     coeffq+16*1, 32
+    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
+    lea                   dstq, [dstq+strideq*2]
+    jmp .end
+
+
+cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+    call  m(idct_8x32_internal)
+    RET
+
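+; DC-only block: scale the single coefficient, broadcast it to all word lanes
+; and reuse the 8x8 add/store loop with r3d preset to 8 iterations; psrlw
+; derives pw_2048 from the already loaded pw_8192 to spare a constant load.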
+.dconly:
+    movd                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]
+    movd                 m2, [o(pw_8192)]
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m2
+    psrlw                m2, 2            ;pw_2048
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    pshuflw              m0, m0, q0000
+    punpcklwd            m0, m0
+    mov                 r3d, 8
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)]
+    jmp m(inv_txfm_add_dct_dct_8x8).loop
+
+.end:
+    RET
+
+
+
+cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
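+    ;eob <= 106 appears to imply in16..in31 are all zero, so .fast skips the
+    ;two upper row groups in pass 1 and uses the reduced .main_fast butterfly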
+    cmp                   eobd, 106
+    jle .fast
+
+    LOAD_8ROWS     coeffq+16*3, 64
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1:
+    mova   [rsp+gprsize+16*9 ], m0                        ;in24
+    mova   [rsp+gprsize+16*10], m4                        ;in28
+    mova   [rsp+gprsize+16*17], m2                        ;in26
+    mova   [rsp+gprsize+16*18], m6                        ;in30
+    mova   [rsp+gprsize+16*31], m1                        ;in25
+    mova   [rsp+gprsize+16*30], m3                        ;in27
+    mova   [rsp+gprsize+16*27], m5                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    LOAD_8ROWS     coeffq+16*2, 64
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_1:
+    mova   [rsp+gprsize+16*7 ], m0                        ;in16
+    mova   [rsp+gprsize+16*8 ], m4                        ;in20
+    mova   [rsp+gprsize+16*15], m2                        ;in18
+    mova   [rsp+gprsize+16*16], m6                        ;in22
+    mova   [rsp+gprsize+16*33], m1                        ;in17
+    mova   [rsp+gprsize+16*28], m3                        ;in19
+    mova   [rsp+gprsize+16*29], m5                        ;in21
+    mova   [rsp+gprsize+16*32], m7                        ;in23
+
+.fast:
+    LOAD_8ROWS     coeffq+16*1, 64
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end:
+    mova   [rsp+gprsize+16*5 ], m0                        ;in8
+    mova   [rsp+gprsize+16*6 ], m4                        ;in12
+    mova   [rsp+gprsize+16*13], m2                        ;in10
+    mova   [rsp+gprsize+16*14], m6                        ;in14
+    mova   [rsp+gprsize+16*21], m1                        ;in9
+    mova   [rsp+gprsize+16*24], m3                        ;in11
+    mova   [rsp+gprsize+16*25], m5                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+    LOAD_8ROWS     coeffq+16*0, 64
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    mova   [rsp+gprsize+16*11], m2                        ;in2
+    mova   [rsp+gprsize+16*12], m6                        ;in6
+    mova   [rsp+gprsize+16*19], m1                        ;in1
+    mova   [rsp+gprsize+16*26], m3                        ;in3
+    mova   [rsp+gprsize+16*23], m5                        ;in5
+    mova   [rsp+gprsize+16*22], m7                        ;in7
+    mova                    m1, m4                        ;in4
+    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
+    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
+
+    cmp                   eobd, 106
+    jg .full
+
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16
+    mova                    m0, [rsp+gprsize+16*11]
+    mova                    m1, [rsp+gprsize+16*12]
+    mova                    m2, [rsp+gprsize+16*13]
+    mova                    m3, [rsp+gprsize+16*14]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call .main_fast
+    jmp  .pass2
+
+.full:
+    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
+    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
+    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
+    mova                    m7, [rsp+gprsize+16*10]       ;in28
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+    call .main
+
+.pass2:
+    lea                     r3, [o(m(idct_8x32_internal).end6)]
+
+.end:
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_8x32_internal).end2)]
+
+.end1:
+    pxor                    m7, m7
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
+                                     8,  9,  10, 11, 12, 13, 14, 15, \
+                                     16, 17, 18, 19, 20, 21, 22, 23, \
+                                     24, 25, 26, 27, 28, 29, 30, 31
+
+    jmp                   tx2q
+
+.end2:
+    lea                   tx2q, [o(m(idct_8x32_internal).end3)]
+    jmp   m(idct_8x8_internal).end
+
+.end3:
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    lea                   tx2q, [o(m(idct_8x32_internal).end4)]
+    jmp   m(idct_8x8_internal).end
+
+.end4:
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    lea                   tx2q, [o(m(idct_8x32_internal).end5)]
+    jmp   m(idct_8x8_internal).end
+
+.end5:
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    mov                   tx2q, r3
+    jmp   m(idct_8x8_internal).end
+
+.end6:
+    ret
+
+ALIGN function_align
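+;32-point odd-half input stage for the case where only in1/in3/in5/in7 are
+;non-zero: each initial t-pair comes from a single pmulhrsw by a prescaled
+;constant, then execution joins the common .main2 tail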
+.main_veryfast:
+    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
+    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31
+    pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17
+    mova                    m7, [o(pd_2048)]
+    mova [rsp+gprsize*2+16*19], m0                        ;t16
+    mova [rsp+gprsize*2+16*34], m3                        ;t31
+    ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a
+    mova [rsp+gprsize*2+16*20], m3                        ;t17a
+    mova [rsp+gprsize*2+16*33], m0                        ;t30a
+    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
+    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29
+    pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19
+    mova [rsp+gprsize*2+16*22], m1                        ;t19
+    mova [rsp+gprsize*2+16*31], m2                        ;t28
+    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
+    mova [rsp+gprsize*2+16*21], m2                        ;t18a
+    mova [rsp+gprsize*2+16*32], m1                        ;t29a
+    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
+    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27
+    pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21
+    mova [rsp+gprsize*2+16*23], m0                        ;t20
+    mova [rsp+gprsize*2+16*30], m3                        ;t27
+    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
+    mova [rsp+gprsize*2+16*24], m3                        ;t21a
+    mova [rsp+gprsize*2+16*29], m0                        ;t26a
+    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
+    pxor                    m0, m0
+    mova                    m3, m0
+    pmulhrsw                m1, m2, [o(pw_4052x8)]
+    pmulhrsw                m2, [o(pw_m601x8)]
+    jmp .main2
+
+ALIGN function_align
+.main_fast: ;bottom half is zero
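+    ;with in17..in31 zero, the first rotation stage collapses to pmulhrsw
+    ;pairs by prescaled +/-cos constants; the later stages match .main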
+    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
+    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
+    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
+    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
+    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
+    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
+    mova                    m7, [o(pd_2048)]
+    psubsw                  m4, m0, m1                    ;t17
+    paddsw                  m0, m1                        ;t16
+    psubsw                  m5, m3, m2                    ;t30
+    paddsw                  m3, m2                        ;t31
+    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
+    mova [rsp+gprsize*2+16*19], m0                        ;t16
+    mova [rsp+gprsize*2+16*20], m5                        ;t17a
+    mova [rsp+gprsize*2+16*33], m4                        ;t30a
+    mova [rsp+gprsize*2+16*34], m3                        ;t31
+    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
+    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
+    pmulhrsw                m3, m0, [o(pw_3703x8)]
+    pmulhrsw                m0, [o(pw_1751x8)]
+    pmulhrsw                m2, m1, [o(pw_3857x8)]
+    pmulhrsw                m1, [o(pw_m1380x8)]
+    psubsw                  m4, m1, m0                    ;t18
+    paddsw                  m0, m1                        ;t19
+    psubsw                  m5, m2, m3                    ;t29
+    paddsw                  m3, m2                        ;t28
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
+    mova [rsp+gprsize*2+16*21], m5                        ;t18a
+    mova [rsp+gprsize*2+16*22], m0                        ;t19
+    mova [rsp+gprsize*2+16*31], m3                        ;t28
+    mova [rsp+gprsize*2+16*32], m4                        ;t29a
+    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
+    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
+    pmulhrsw                m3, m0, [o(pw_3973x8)]
+    pmulhrsw                m0, [o(pw_995x8)]
+    pmulhrsw                m2, m1, [o(pw_3513x8)]
+    pmulhrsw                m1, [o(pw_m2106x8)]
+    psubsw                  m4, m0, m1                    ;t21
+    paddsw                  m0, m1                        ;t20
+    psubsw                  m5, m3, m2                    ;t26
+    paddsw                  m3, m2                        ;t27
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
+    mova [rsp+gprsize*2+16*23], m0                        ;t20
+    mova [rsp+gprsize*2+16*24], m5                        ;t21a
+    mova [rsp+gprsize*2+16*29], m4                        ;t26a
+    mova [rsp+gprsize*2+16*30], m3                        ;t27
+    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
+    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
+    pmulhrsw                m3, m0, [o(pw_3290x8)]
+    pmulhrsw                m0, [o(pw_2440x8)]
+    pmulhrsw                m1, m2, [o(pw_4052x8)]
+    pmulhrsw                m2, [o(pw_m601x8)]
+    jmp .main2
+
+ALIGN function_align
+.main:
+    mova                    m7, [o(pd_2048)]
+    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
+    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
+    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
+    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
+    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
+    psubsw                  m4, m0, m2                    ;t17
+    paddsw                  m0, m2                        ;t16
+    psubsw                  m5, m3, m1                    ;t30
+    paddsw                  m3, m1                        ;t31
+    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
+    mova [rsp+gprsize*2+16*19], m0                        ;t16
+    mova [rsp+gprsize*2+16*20], m5                        ;t17a
+    mova [rsp+gprsize*2+16*33], m4                        ;t30a
+    mova [rsp+gprsize*2+16*34], m3                        ;t31
+    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
+    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
+    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
+    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
+    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
+    psubsw                  m4, m2, m0                    ;t18
+    paddsw                  m0, m2                        ;t19
+    psubsw                  m5, m1, m3                    ;t29
+    paddsw                  m3, m1                        ;t28
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
+    mova [rsp+gprsize*2+16*21], m5                        ;t18a
+    mova [rsp+gprsize*2+16*22], m0                        ;t19
+    mova [rsp+gprsize*2+16*31], m3                        ;t28
+    mova [rsp+gprsize*2+16*32], m4                        ;t29a
+    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
+    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
+    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
+    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
+    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
+    psubsw                  m4, m0, m2                    ;t21
+    paddsw                  m0, m2                        ;t20
+    psubsw                  m5, m3, m1                    ;t26
+    paddsw                  m3, m1                        ;t27
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
+    mova [rsp+gprsize*2+16*23], m0                        ;t20
+    mova [rsp+gprsize*2+16*24], m5                        ;t21a
+    mova [rsp+gprsize*2+16*29], m4                        ;t26a
+    mova [rsp+gprsize*2+16*30], m3                        ;t27
+    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
+    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
+    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
+    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
+    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
+
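+;common tail of .main/.main_fast/.main_veryfast: completes the t22..t25
+;rotations, then the last butterfly stage folds t16..t31 into the 16-point
+;intermediates (tmp0..tmp15 on the stack) to produce out0..out31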
+.main2:
+    psubsw                  m4, m2, m0                    ;t22
+    paddsw                  m0, m2                        ;t23
+    psubsw                  m5, m1, m3                    ;t25
+    paddsw                  m3, m1                        ;t24
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a
+    mova                    m2, [rsp+gprsize*2+16*24]     ;t21a
+    psubsw                  m1, m5, m2                    ;t21
+    paddsw                  m5, m2                        ;t22
+    mova [rsp+gprsize*2+16*25], m5                        ;t22
+    mova                    m2, [rsp+gprsize*2+16*29]     ;t26a
+    psubsw                  m5, m4, m2                    ;t26
+    paddsw                  m4, m2                        ;t25
+    mova [rsp+gprsize*2+16*28], m4                        ;t25
+    ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a
+    mova [rsp+gprsize*2+16*24], m5                        ;t21a
+    mova [rsp+gprsize*2+16*29], m1                        ;t26a
+
+    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
+    psubsw                  m2, m0, m1                    ;t20a
+    paddsw                  m0, m1                        ;t23a
+    psubsw                  m6, m3, m5                    ;t27a
+    paddsw                  m3, m5                        ;t24a
+    ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27
+    mova [rsp+gprsize*2+16*26], m0                        ;t23a
+    mova [rsp+gprsize*2+16*27], m3                        ;t24a
+    mova [rsp+gprsize*2+16*30], m2                        ;t27
+
+    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
+    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
+    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
+    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
+    psubsw                  m4, m0, m1                    ;t18
+    paddsw                  m0, m1                        ;t17
+    psubsw                  m5, m3, m2                    ;t29
+    paddsw                  m3, m2                        ;t30
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
+    mova [rsp+gprsize*2+16*20], m0                        ;t17
+    mova [rsp+gprsize*2+16*21], m5                        ;t18a
+    mova [rsp+gprsize*2+16*32], m4                        ;t29a
+    mova [rsp+gprsize*2+16*33], m3                        ;t30
+    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
+    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
+    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
+    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
+    psubsw                  m4, m0, m1                    ;t19a
+    paddsw                  m0, m1                        ;t16a
+    psubsw                  m5, m3, m2                    ;t28a
+    paddsw                  m3, m2                        ;t31a
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
+    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
+    psubsw                  m1, m5, m6                    ;t20a
+    paddsw                  m5, m6                        ;t19a
+    psubsw                  m6, m2, m5                    ;out19
+    paddsw                  m2, m5                        ;out12
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
+    mova [rsp+gprsize*2+16*22], m6                        ;out19
+    mova [rsp+gprsize*2+16*15], m2                        ;out12
+    psubsw                  m6, m4, m5                    ;t27a
+    paddsw                  m4, m5                        ;t28a
+    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
+    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
+    psubsw                  m5, m2, m4                    ;out28
+    paddsw                  m2, m4                        ;out3
+    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
+    mova [rsp+gprsize*2+16*31], m5                        ;out28
+    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
+    psubsw                  m5, m4, m6                    ;out20
+    paddsw                  m4, m6                        ;out11
+    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
+    mova [rsp+gprsize*2+16*23], m5                        ;out20
+    mova [rsp+gprsize*2+16*14], m4                        ;out11
+    psubsw                  m5, m2, m1                    ;out27
+    paddsw                  m2, m1                        ;out4
+    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
+    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
+    mova [rsp+gprsize*2+16*30], m5                        ;out27
+    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
+    psubsw                  m5, m0, m1                    ;t23
+    paddsw                  m0, m1                        ;t16
+    psubsw                  m2, m3, m4                    ;t24
+    paddsw                  m3, m4                        ;t31
+    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
+    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
+    psubsw                  m4, m6, m0                    ;out16
+    paddsw                  m6, m0                        ;out15
+    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
+    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
+    mova [rsp+gprsize*2+16*18], m6                        ;out15
+    mova [rsp+gprsize*2+16*19], m4                        ;out16
+    psubsw                  m6, m0, m3                    ;out31
+    paddsw                  m0, m3                        ;out0
+    psubsw                  m4, m1, m2                    ;out23
+    paddsw                  m1, m2                        ;out8
+    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
+    mova [rsp+gprsize*2+16*34], m6                        ;out31
+    mova [rsp+gprsize*2+16*11], m1                        ;out8
+    mova [rsp+gprsize*2+16*26], m4                        ;out23
+    paddsw                  m6, m3, m5                    ;out7
+    psubsw                  m3, m5                        ;out24
+    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
+    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
+    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
+    mova [rsp+gprsize*2+16*27], m3                        ;out24
+    psubsw                  m4, m1, m5                    ;t22a
+    paddsw                  m1, m5                        ;t17a
+    psubsw                  m3, m2, m1                    ;out17
+    paddsw                  m2, m1                        ;out14
+    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
+    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
+    mova [rsp+gprsize*2+16*17], m2                        ;out14
+    mova [rsp+gprsize*2+16*20], m3                        ;out17
+    psubsw                  m2, m1, m5                    ;t25a
+    paddsw                  m1, m5                        ;t30a
+    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
+    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
+    psubsw                  m3, m5, m1                    ;out30
+    paddsw                  m5, m1                        ;out1
+    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
+    mova [rsp+gprsize*2+16*33], m3                        ;out30
+    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
+    psubsw                  m3, m1, m2                    ;out22
+    paddsw                  m1, m2                        ;out9
+    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
+    mova [rsp+gprsize*2+16*25], m3                        ;out22
+    mova [rsp+gprsize*2+16*12], m1                        ;out9
+    psubsw                  m3, m5, m4                    ;out25
+    paddsw                  m5, m4                        ;out6
+    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
+    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
+    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
+    mova [rsp+gprsize*2+16*28], m3                        ;out25
+    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
+    paddsw                  m3, m4, m1                    ;t18
+    psubsw                  m4, m1                        ;t21
+    psubsw                  m5, m2, m3                    ;out18
+    paddsw                  m2, m3                        ;out13
+    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
+    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
+    mova [rsp+gprsize*2+16*21], m5                        ;out18
+    mova [rsp+gprsize*2+16*16], m2                        ;out13
+    psubsw                  m5, m3, m1                    ;t26
+    paddsw                  m3, m1                        ;t29
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
+    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
+    psubsw                  m1, m2, m3                    ;out29
+    paddsw                  m2, m3                        ;out2
+    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
+    mova [rsp+gprsize*2+16*32], m1                        ;out29
+    psubsw                  m7, m3, m5                    ;out21
+    paddsw                  m3, m5                        ;out10
+    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
+    mova [rsp+gprsize*2+16*24], m7                        ;out21
+    mova [rsp+gprsize*2+16*13], m3                        ;out10
+    psubsw                  m1, m5, m4                    ;out26
+    paddsw                  m5, m4                        ;out5
+    mova                    m7, m6                        ;out7
+    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
+    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
+    mova [rsp+gprsize*2+16*29], m1                        ;out26
+    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
+    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
+    ret
+
+
+cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+    call  m(idct_32x8_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd
+    mov                    r3d, 8
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
+
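+;shared DC-only store body, also entered from the 32x16 and 32x32 dconly
+;paths with r3d preset to the row count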
+.body:
+    pmulhrsw                m0, m2
+    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
+    pmulhrsw                m0, m1
+    pmulhrsw                m0, m2
+    pshuflw                 m0, m0, q0000
+    punpcklwd               m0, m0
+    pxor                    m5, m5
+
+.loop:
+    mova                    m1, [dstq+16*0]
+    mova                    m3, [dstq+16*1]
+    punpckhbw               m2, m1, m5
+    punpcklbw               m1, m5
+    punpckhbw               m4, m3, m5
+    punpcklbw               m3, m5
+    paddw                   m2, m0
+    paddw                   m1, m0
+    paddw                   m4, m0
+    paddw                   m3, m0
+    packuswb                m1, m2
+    packuswb                m3, m4
+    mova           [dstq+16*0], m1
+    mova           [dstq+16*1], m3
+    add                   dstq, strideq
+    dec                    r3d
+    jg .loop
+    jmp                   tx2q
+
+.end:
+    RET
+
+
+cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+    LOAD_8ROWS     coeffq+16*0, 64
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+16*2, 64
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+16*1, 32
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    cmp                   eobd, 106
+    jg  .full
+    call m(idct_8x32_internal).main_fast
+    jmp .pass2
+
+.full:
+    LOAD_8ROWS    coeffq+16*17, 32
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    call m(idct_8x32_internal).main
+
+.pass2:
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x8_internal).end)]
+    jmp  m(idct_8x32_internal).end1
+
+.end:
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.end1:
+    lea                     r3, [dstq+8]
+    lea                   tx2q, [o(m(idct_32x8_internal).end2)]
+    jmp   m(idct_8x8_internal).pass2_main
+
+.end2:
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.end3:
+    mov                   dstq, r3
+    add                     r3, 8
+    lea                   tx2q, [o(m(idct_32x8_internal).end4)]
+    jmp   m(idct_8x8_internal).pass2_main
+
+.end4:
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end5)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.end5:
+    mov                   dstq, r3
+    add                     r3, 8
+    lea                   tx2q, [o(m(idct_32x8_internal).end6)]
+    jmp   m(idct_8x8_internal).pass2_main
+
+.end6:
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end7)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.end7:
+    mov                   dstq, r3
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
+    jmp   m(idct_8x8_internal).pass2_main
+
+.end8:
+    ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    mov                    r5d, 4
+    mov                   tx2d, 2
+    cmp                   eobd, 107
+    cmovns                tx2d, r5d
+    mov                    r3d, tx2d
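+    ;iteration count depends on eob: 2 passes over the coefficients when
+    ;eob < 107, otherwise 4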
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
+.loop:
+    LOAD_8ROWS     coeffq+16*0, 64
+    paddsw                  m6, [o(pw_5)]
+    mova            [rsp+16*1], m6
+    mova                    m6, [o(pw_5)]
+    REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    call  m(idct_8x8_internal).pass1_end3
+    REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+    mova            [rsp+16*2], m5
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*0], m7
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+    pxor                    m7, m7
+    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+    add                 coeffq, 16
+    dec                    r3d
+    jg .loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    mov                    r5d, 4
+    mov                   tx2d, 2
+    cmp                   eobd, 107
+    cmovns                tx2d, r5d
+    mov                    r3d, tx2d
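+    ;same eob-based iteration count selection as the 8x32 case above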
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+.loop:
+    LOAD_8ROWS     coeffq+16*0, 16
+    pmulhrsw                m6, [o(pw_4096)]
+    mova            [rsp+16*1], m6
+    mova                    m6, [o(pw_4096)]
+    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
+    call  m(idct_8x8_internal).pass1_end3
+
+    mov             [rsp+16*3], dstq
+    mova            [rsp+16*2], m5
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+    call  m(idct_8x8_internal).end3
+
+    add                 coeffq, 16*8
+    mov                   dstq, [rsp+16*3]
+    lea                   dstq, [dstq+8]
+    dec                    r3d
+    jg .loop
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+    call  m(idct_16x32_internal)
+    RET
+
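+;DC-only: pw_2896x8 is applied twice, which presumably folds the rectangular
+;(16x32) scale into the DC value before the shared 16x4 dconly loop stores it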
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_16384)]
+    mov               [coeffq], eobd
+    pmulhrsw                m0, m1
+    mov                    r2d, 16
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)]
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+
+.end:
+    RET
+
+cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
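+    ;pass 1 handles the 16-point rows in 8x8 quarters; the trailing ",1" on
+    ;LOAD_8ROWS presumably applies the rect2 (1/sqrt(2)) prescale on load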
+    LOAD_8ROWS     coeffq+16*1, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*5, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    mova        [coeffq+16*1 ], m0                        ;in8
+    mova        [coeffq+16*5 ], m4                        ;in12
+    mova   [rsp+gprsize+16*13], m2                        ;in10
+    mova   [rsp+gprsize+16*14], m6                        ;in14
+    mova   [rsp+gprsize+16*21], m1                        ;in9
+    mova   [rsp+gprsize+16*24], m3                        ;in11
+    mova   [rsp+gprsize+16*25], m5                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+    LOAD_8ROWS     coeffq+16*0, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*4, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    mova   [rsp+gprsize+16*11], m2                        ;in2
+    mova   [rsp+gprsize+16*12], m6                        ;in6
+    mova   [rsp+gprsize+16*19], m1                        ;in1
+    mova   [rsp+gprsize+16*26], m3                        ;in3
+    mova   [rsp+gprsize+16*23], m5                        ;in5
+    mova   [rsp+gprsize+16*22], m7                        ;in7
+
+    cmp                   eobd, 150
+    jg .full
+
+    mova                    m1, m4                        ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*5 ]            ;in12
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    mova                    m0, [rsp+gprsize+16*11]       ;in2
+    mova                    m1, [rsp+gprsize+16*12]       ;in6
+    mova                    m2, [rsp+gprsize+16*13]       ;in10
+    mova                    m3, [rsp+gprsize+16*14]       ;in14
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+    jmp  .pass2
+
+.full:
+    mova        [coeffq+16*0 ], m0                        ;in0
+    mova        [coeffq+16*4 ], m4                        ;in4
+
+    LOAD_8ROWS     coeffq+16*2, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*6, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end5)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end5:
+    mova        [coeffq+16*2 ], m0                        ;in16
+    mova        [coeffq+16*6 ], m4                        ;in20
+    mova   [rsp+gprsize+16*15], m2                        ;in18
+    mova   [rsp+gprsize+16*16], m6                        ;in22
+    mova   [rsp+gprsize+16*33], m1                        ;in17
+    mova   [rsp+gprsize+16*28], m3                        ;in19
+    mova   [rsp+gprsize+16*29], m5                        ;in21
+    mova   [rsp+gprsize+16*32], m7                        ;in23
+
+    LOAD_8ROWS     coeffq+16*3, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*7, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end6)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end6:
+    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end7)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end7:
+    mova   [rsp+gprsize+16*17], m2                        ;in26
+    mova   [rsp+gprsize+16*18], m6                        ;in30
+    mova   [rsp+gprsize+16*31], m1                        ;in25
+    mova   [rsp+gprsize+16*30], m3                        ;in27
+    mova   [rsp+gprsize+16*27], m5                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    mova                    m6, m0                        ;in24
+    mova                    m7, m4                        ;in28
+    mova                    m0, [coeffq+16*0 ]            ;in0
+    mova                    m1, [coeffq+16*4 ]            ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*5 ]            ;in12
+    mova                    m4, [coeffq+16*2 ]            ;in16
+    mova                    m5, [coeffq+16*6 ]            ;in20
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main
+
+.pass2:
+    mov  [rsp+gprsize*1+16*35], eobd
+    lea                     r3, [dstq+8]
+    mov  [rsp+gprsize*2+16*35], r3
+    lea                     r3, [o(m(idct_16x32_internal).end)]
+    jmp  m(idct_8x32_internal).end
+
+.end:
+    mov                   dstq, [rsp+gprsize*2+16*35]
+    mov                   eobd, [rsp+gprsize*1+16*35]
+    add                 coeffq, 16*32
+
+    mova                    m0, [coeffq+16*4 ]            ;in1
+    mova                    m1, [coeffq+16*12]            ;in3
+    mova                    m2, [coeffq+16*20]            ;in5
+    mova                    m3, [coeffq+16*28]            ;in7
+    mova                    m4, [coeffq+16*5 ]            ;in9
+    mova                    m5, [coeffq+16*13]            ;in11
+    mova                    m6, [coeffq+16*21]            ;in13
+    mova                    m7, [coeffq+16*29]            ;in15
+
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mova                    m0, [coeffq+16*0 ]            ;in0
+    mova                    m1, [coeffq+16*16]            ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*17]            ;in12
+
+    cmp                   eobd, 150
+    jg .full1
+
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]            ;in2
+    mova                    m1, [coeffq+16*24]            ;in6
+    mova                    m2, [coeffq+16*9 ]            ;in10
+    mova                    m3, [coeffq+16*25]            ;in14
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+    jmp  .end1
+
+.full1:
+    mova                    m4, [coeffq+16*2 ]            ;in16
+    mova                    m5, [coeffq+16*18]            ;in20
+    mova                    m6, [coeffq+16*3 ]            ;in24
+    mova                    m7, [coeffq+16*19]            ;in28
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]            ;in2
+    mova                    m1, [coeffq+16*24]            ;in6
+    mova                    m2, [coeffq+16*9 ]            ;in10
+    mova                    m3, [coeffq+16*25]            ;in14
+    mova                    m4, [coeffq+16*10]            ;in18
+    mova                    m5, [coeffq+16*26]            ;in22
+    mova                    m6, [coeffq+16*11]            ;in26
+    mova                    m7, [coeffq+16*27]            ;in30
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*6 ]            ;in17
+    mova                    m1, [coeffq+16*14]            ;in19
+    mova                    m2, [coeffq+16*22]            ;in21
+    mova                    m3, [coeffq+16*30]            ;in23
+    mova                    m4, [coeffq+16*7 ]            ;in25
+    mova                    m5, [coeffq+16*15]            ;in27
+    mova                    m6, [coeffq+16*23]            ;in29
+    mova                    m7, [coeffq+16*31]            ;in31
+
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+
+.end1:
+    jmp m(idct_8x32_internal).pass2
+
+
+
+cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_32x16_internal)
+    call m(idct_8x16_internal).pass2
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]
+    LOAD_8ROWS       rsp+16*11, 16
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]
+    LOAD_8ROWS       rsp+16*19, 16
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]
+    LOAD_8ROWS       rsp+16*27, 16
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_16384)]
+    mov               [coeffq], eobd
+    pmulhrsw                m0, m1
+    mov                    r3d, 16
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
+    jmp m(inv_txfm_add_dct_dct_32x8).body
+
+
+cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
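+    ;pass 1 sweeps the block twice, first at coeffq+16 and then at coeffq,
+    ;with r3 holding the continuation label for each sweep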
+    add                 coeffq, 16
+    lea                     r3, [o(m(idct_32x16_internal).pass1_end1)]
+.pass1:
+    LOAD_8ROWS     coeffq+16*0, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+16*4, 128, 1
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+16*2, 64, 1
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    LOAD_8ROWS    coeffq+16*34, 64, 1
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    call m(idct_8x32_internal).main
+
+.pass1_end:
+    mova   [rsp+gprsize+16*0 ], m7
+    mov                   tx2q, r3
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+16*0, 32
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*16, 32
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+16*32, 32
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+16*48, 32
+
+    sub                 coeffq, 16
+    lea                     r3, [o(m(idct_32x16_internal).end)]
+    jmp .pass1
+
+.end:
+    ret
+
+
+cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, eobd
+    cmp                   eobd, 43                ;if (eob > 43)
+    sbb                    r3d, r3d               ;  iteration_count++
+    cmp                    r4d, 150               ;if (eob > 150)
+    sbb                    r3d, 0                 ;  iteration_count++
+    cmp                    r4d, 278               ;if (eob > 278)
+    sbb                    r3d, -4                ;  iteration_count++
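+    ;branchless count: each cmp sets CF when eob is below its threshold and
+    ;the sbb chain folds the borrows into r3d, leaving 4 minus the number of
+    ;thresholds eob falls short of (1 to 4 iterations)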
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4
+    mov     [rsp+gprsize+16*3], r3d
+    mov   [rsp+gprsize*2+16*3], coeffq
+
+.loop:
+    LOAD_8ROWS          coeffq, 64, 1
+    mova            [rsp+16*1], m6
+    pxor                    m6, m6
+    REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end3
+    mova            [rsp+16*0], m2
+    mova            [rsp+16*1], m3
+    mova            [rsp+16*2], m4
+    mova                    m3, [o(pw_1697x16)]
+    mova                    m4, [o(pw_16384)]
+    REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
+    mova                    m2, [o(pw_8192)]
+    REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+    mova                    m2, [rsp+16*0]
+    mova            [rsp+16*0], m7
+    IDTX16                   2, 7, 3, 4
+    mova                    m7, [rsp+16*2]
+    mova            [rsp+16*2], m5
+    IDTX16                   7, 5, 3, 4
+    mova                    m5, [rsp+16*1]
+    mova            [rsp+16*1], m6
+    pmulhrsw                m3, m5
+    pmulhrsw                m3, m4
+    psrlw                   m4, 1 ; pw_8192
+    paddsw                  m3, m5
+    pmulhrsw                m2, m4
+    pmulhrsw                m3, m4
+    pmulhrsw                m4, m7
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+    add                 coeffq, 16
+    dec                    r3d
+    jg .loop
+    mov                 coeffq, [rsp+gprsize*2+16*3]
+    add                 coeffq, 64*8
+    mov                    r3d, [rsp+gprsize+16*3]
+    xor                   dstq, dstq
+    mov     [rsp+gprsize+16*3], dstq
+    mov                   dstq, [rsp+16*3]
+    test                   r3d, r3d
+    jnz .loop
+    RET
+
+
+cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 12                ;1100b
+    mov                    r5d, 136               ;1000 1000b
+    cmp                   eobd, 44                ;if (eob > 43)
+    cmovns                 r4d, r5d               ;  iteration_count+2
+    cmp                   eobd, 151               ;if (eob > 150)
+    mov                    r3d, 34952             ;1000 1000 1000 1000b
+    cmovs                  r3d, r4d               ;  iteration_count += 4
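+    ;r3d is a bit-pattern schedule consumed two bits per 8x8 strip in
+    ;.loop_end: a clear bit 1 after each shift steps dst/coeff to the next
+    ;strip row, and the loop ends once r3d is exhausted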
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4
+
+.loop:
+    LOAD_8ROWS          coeffq, 32, 1
+    REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova            [rsp+16*1], m6
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end3
+    mova            [rsp+16*1], m5
+    mova            [rsp+16*2], m6
+    mova                    m6, [o(pw_1697x16)]
+    REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
+    pmulhrsw                m7, [o(pw_2048)]
+    mova                    m5, [rsp+16*1]
+    mova            [rsp+16*0], m7
+    IDTX16                   5, 7, 6
+    mova                    m7, [rsp+16*2]
+    IDTX16                   7, 6, 6
+    mova                    m6, [o(pw_2048)]
+    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    mova            [rsp+16*2], m5
+    mova            [rsp+16*1], m7
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+    pxor                    m7, m7
+    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+.loop_end:
+    add                 coeffq, 16
+    shr                    r3d, 2
+    jz .ret
+    test                   r3d, 2
+    jnz .loop
+    mov                    r4d, r3d
+    and                    r4d, 1
+    lea                 coeffq, [coeffq+r4*8+32*7]
+    mov                   dstq, [rsp+16*3]
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4
+    jmp .loop
+
+.ret:
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_32x32_internal)
+    RET
+
+.dconly:
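+    ; eob == 0: only the dc coefficient is set, so the output is a single
+    ; rounded constant; pmulhrsw by pw_2896x8 multiplies by 23168/32768
+    ; (~1/sqrt(2)), and storing eobd (= 0) clears the dc coefficient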
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd
+    mov                    r3d, 32
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
+    jmp m(inv_txfm_add_dct_dct_32x8).body
+
+
+cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 2
+    sub                   eobd, 136
+    mov  [rsp+gprsize*1+16*35], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
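+    ; eob < 136 keeps the coefficients in the low-frequency 16x16 quarter:
+    ; pass 1 runs 2 column iterations instead of 4, and the saved sign of
+    ; eobd-136 re-selects the .fast path inside each pass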
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*2+16*35], coeffq
+
+.pass1_loop:
+    LOAD_8ROWS     coeffq+64*1, 64*2
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mov                   tx2d, [rsp+gprsize*1+16*35]
+    test                  tx2d, tx2d
+    jl .fast
+
+.full:
+    LOAD_8ROWS     coeffq+64*0, 64*4
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+64*2, 64*4
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS    coeffq+64*17, 64*2
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+    jmp .pass1_end
+
+.fast:
+    mova                    m0, [coeffq+256*0]
+    mova                    m1, [coeffq+256*1]
+    mova                    m2, [coeffq+256*2]
+    mova                    m3, [coeffq+256*3]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    mova                    m0, [coeffq+128*1]
+    mova                    m1, [coeffq+128*3]
+    mova                    m2, [coeffq+128*5]
+    mova                    m3, [coeffq+128*7]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+
+.pass1_end:
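+    ; rescale all 32 intermediate rows by pw_8192 (1/4) and transpose them
+    ; 8 at a time through idct_8x8_internal's pass1_end1 tail, storing back
+    ; to coeffq at a 64-byte row stride (the .pass1_end1..4 chain below)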
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+64*24, 64
+
+    add                 coeffq, 16
+    dec                    r3d
+    jg .pass1_loop
+
+
+.pass2:
+    mov                 coeffq, [rsp+gprsize*2+16*35]
+    mov                    r3d, 4
+    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
+
+.pass2_loop:
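+    ; one 8-column slice per iteration: odd rows (in1..in31) are staged in
+    ; the idct_8x32 scratch area, even rows go through the shared 8x8 and
+    ; 16x8 mains, with the same full/fast split as pass 1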
+    mov  [rsp+gprsize*3+16*35], r3d
+    lea                     r3, [dstq+8]
+    mov  [rsp+gprsize*2+16*35], r3
+
+    mova                    m0, [coeffq+16*4 ]
+    mova                    m1, [coeffq+16*12]
+    mova                    m2, [coeffq+16*20]
+    mova                    m3, [coeffq+16*28]
+    mova                    m4, [coeffq+16*5 ]
+    mova                    m5, [coeffq+16*13]
+    mova                    m6, [coeffq+16*21]
+    mova                    m7, [coeffq+16*29]
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mov                   eobd, [rsp+gprsize*1+16*35]
+    test                  eobd, eobd
+    jl .fast1
+
+.full1:
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*16]
+    mova                    m2, [coeffq+16*1 ]
+    mova                    m3, [coeffq+16*17]
+    mova                    m4, [coeffq+16*2 ]
+    mova                    m5, [coeffq+16*18]
+    mova                    m6, [coeffq+16*3 ]
+    mova                    m7, [coeffq+16*19]
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    mova                    m4, [coeffq+16*10]
+    mova                    m5, [coeffq+16*26]
+    mova                    m6, [coeffq+16*11]
+    mova                    m7, [coeffq+16*27]
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*6 ]
+    mova                    m1, [coeffq+16*14]
+    mova                    m2, [coeffq+16*22]
+    mova                    m3, [coeffq+16*30]
+    mova                    m4, [coeffq+16*7 ]
+    mova                    m5, [coeffq+16*15]
+    mova                    m6, [coeffq+16*23]
+    mova                    m7, [coeffq+16*31]
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+    jmp                   tx2q
+
+.fast1:
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*16]
+    mova                    m2, [coeffq+16*1 ]
+    mova                    m3, [coeffq+16*17]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+    jmp                   tx2q
+
+.pass2_end:
+    lea                     r3, [o(m(idct_32x32_internal).pass2_end1)]
+    jmp  m(idct_8x32_internal).end
+
+.pass2_end1:
+    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
+    add                 coeffq, 16*32
+    mov                   dstq, [rsp+gprsize*2+16*35]
+    mov                    r3d, [rsp+gprsize*3+16*35]
+    dec                    r3d
+    jg .pass2_loop
+
+    ret
+
+
+cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 2
+    cmp                   eobd, 136
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
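+    ; eob < 136: only the top-left 16x16 is populated, so the inner (row)
+    ; and outer (column-group) tile counts both drop from 4 to 2; r3d is
+    ; stored twice below to serve as both counters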
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    lea                     r4, [dstq+8]
+    mov   [rsp+gprsize*0+16*3], r4
+    mov   [rsp+gprsize*1+16*3], r3d
+    mov   [rsp+gprsize*2+16*3], r3d
+    mov   [rsp+gprsize*3+16*3], coeffq
+
+.loop:
+    LOAD_8ROWS          coeffq, 64
+    mova            [rsp+16*1], m6
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end3
+    pmulhrsw                m7, [o(pw_8192)]
+    mova            [rsp+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*2], m5
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+
+    pxor                    m7, m7
+    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+
+    add                 coeffq, 16
+    dec                    r3d
+    jg .loop
+
+    mov                    r4d, [rsp+gprsize*2+16*3]
+    dec                    r4d
+    jle .ret
+
+    mov                   dstq, [rsp+gprsize*0+16*3]
+    mov                 coeffq, [rsp+gprsize*3+16*3]
+    mov   [rsp+gprsize*2+16*3], r4
+    lea                     r3, [dstq+8]
+    add                 coeffq, 64*8
+    mov   [rsp+gprsize*0+16*3], r3
+    mov                    r3d, [rsp+gprsize*1+16*3]
+    mov   [rsp+gprsize*3+16*3], coeffq
+    jmp .loop
+
+.ret:
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_16x64_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd
+    mov                    r2d, 32
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)]
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+
+.end:
+    RET
+
+
+cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 2
+    sub                   eobd, 151
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
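+    ; eob < 151: only 2 of the 4 pass-1 iterations are needed; the saved
+    ; sign of eobd-151 later selects the reduced .fast path in pass 2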
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+    LOAD_8ROWS     coeffq+64*0, 64*2
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+64*1, 64*2
+    call m(idct_16x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_16x64_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_16x64_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*0, 64
+
+    add                 coeffq, 16
+    dec                    r3d
+    jg .pass1_loop
+
+    mov                 coeffq, [rsp+gprsize*2+16*67]
+    mov                    r3d, 2
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_16x64_internal).end1)]
+
+.pass2_loop:
+    mov  [rsp+gprsize*3+16*67], r3d
+    mov                   eobd, [rsp+gprsize*1+16*67]
+
+    mova                    m0, [coeffq+16*4 ]            ;in1
+    mova                    m1, [coeffq+16*12]            ;in3
+    mova                    m2, [coeffq+16*20]            ;in5
+    mova                    m3, [coeffq+16*28]            ;in7
+    mova                    m4, [coeffq+16*5 ]            ;in9
+    mova                    m5, [coeffq+16*13]            ;in11
+    mova                    m6, [coeffq+16*21]            ;in13
+    mova                    m7, [coeffq+16*29]            ;in15
+    mova   [rsp+gprsize+16*35], m0                        ;in1
+    mova   [rsp+gprsize+16*49], m1                        ;in3
+    mova   [rsp+gprsize+16*43], m2                        ;in5
+    mova   [rsp+gprsize+16*41], m3                        ;in7
+    mova   [rsp+gprsize+16*39], m4                        ;in9
+    mova   [rsp+gprsize+16*45], m5                        ;in11
+    mova   [rsp+gprsize+16*47], m6                        ;in13
+    mova   [rsp+gprsize+16*37], m7                        ;in15
+
+    pxor                    m4, m4
+    mova                    m0, [coeffq+16*0]
+    mova                    m1, [coeffq+16*1]
+
+    test                  eobd, eobd
+    jl .fast
+
+.full:
+    mova                    m2, [coeffq+16*2]
+    mova                    m3, [coeffq+16*3]
+
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    pxor                    m4, m4
+    mova                    m0, [coeffq+16*16]
+    mova                    m1, [coeffq+16*17]
+    mova                    m2, [coeffq+16*18]
+    mova                    m3, [coeffq+16*19]
+
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    mova                    m4, [coeffq+16*10]
+    mova                    m5, [coeffq+16*26]
+    mova                    m6, [coeffq+16*11]
+    mova                    m7, [coeffq+16*27]
+    mova   [rsp+gprsize+16*19], m0
+    mova   [rsp+gprsize+16*26], m1
+    mova   [rsp+gprsize+16*23], m2
+    mova   [rsp+gprsize+16*22], m3
+    mova   [rsp+gprsize+16*21], m4
+    mova   [rsp+gprsize+16*24], m5
+    mova   [rsp+gprsize+16*25], m6
+    mova   [rsp+gprsize+16*20], m7
+
+    call m(idct_8x32_internal).main_fast
+    SAVE_8ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*6 ]            ;in17
+    mova                    m1, [coeffq+16*14]            ;in19
+    mova                    m2, [coeffq+16*22]            ;in21
+    mova                    m3, [coeffq+16*30]            ;in23
+    mova                    m4, [coeffq+16*7 ]            ;in25
+    mova                    m5, [coeffq+16*15]            ;in27
+    mova                    m6, [coeffq+16*23]            ;in29
+    mova                    m7, [coeffq+16*31]            ;in31
+    mova   [rsp+gprsize+16*63], m0                        ;in17
+    mova   [rsp+gprsize+16*53], m1                        ;in19
+    mova   [rsp+gprsize+16*55], m2                        ;in21
+    mova   [rsp+gprsize+16*61], m3                        ;in23
+    mova   [rsp+gprsize+16*59], m4                        ;in25
+    mova   [rsp+gprsize+16*57], m5                        ;in27
+    mova   [rsp+gprsize+16*51], m6                        ;in29
+    mova   [rsp+gprsize+16*65], m7                        ;in31
+
+    call .main
+    jmp  .end
+
+.fast:
+    REPX          {mova x, m4}, m2, m3, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    pxor                    m4, m4
+    mova                    m0, [coeffq+16*16]
+    mova                    m1, [coeffq+16*17]
+
+    REPX          {mova x, m4}, m2, m3, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+
+    call m(idct_8x32_internal).main_veryfast
+    SAVE_8ROWS    rsp+gprsize+16*3, 16
+
+    call .main_fast
+
+.end:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mov                     r3, r4
+    jmp  m(idct_8x32_internal).end2
+
+.end1:
+    LOAD_8ROWS   rsp+gprsize+16*35, 16
+    lea                   dstq, [dstq+strideq*2]
+    add                    rsp, 16*32
+    lea                     r3, [o(m(idct_16x64_internal).end2)]
+    jmp  m(idct_8x32_internal).end
+
+.end2:
+    add                 coeffq, 16*32
+    sub                    rsp, 16*32
+
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    mov                    r3d, [rsp+gprsize*3+16*67]
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_16x64_internal).end1)]
+
+    dec                    r3d
+    jg .pass2_loop
+    ret
+
+
+ALIGN function_align
+.main_fast:
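+    ; reduced-input odd half: in17..in31 are assumed zero, so each t32..t63
+    ; seed comes from a single pmulhrsw by its +/- cosine constant instead
+    ; of an add/sub of two scaled inputs; control merges with .main at .main2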
+    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
+    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63
+    pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33
+    mova                    m7, [o(pd_2048)]
+    mova [rsp+gprsize*2+16*35], m0                        ;t32
+    mova [rsp+gprsize*2+16*66], m3                        ;t63
+    ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a
+    mova [rsp+gprsize*2+16*36], m3                        ;t33a
+    mova [rsp+gprsize*2+16*65], m0                        ;t62a
+
+    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
+    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61
+    pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35
+    mova [rsp+gprsize*2+16*38], m1                        ;t35
+    mova [rsp+gprsize*2+16*63], m2                        ;t60
+    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a
+    mova [rsp+gprsize*2+16*37], m2                        ;t34a
+    mova [rsp+gprsize*2+16*64], m1                        ;t61a
+
+    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
+    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59
+    pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37
+    mova [rsp+gprsize*2+16*39], m0                        ;t36
+    mova [rsp+gprsize*2+16*62], m3                        ;t59
+    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a
+    mova [rsp+gprsize*2+16*40], m3                        ;t37a
+    mova [rsp+gprsize*2+16*61], m0                        ;t58a
+
+    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
+    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57
+    pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39
+    mova [rsp+gprsize*2+16*42], m1                        ;t39
+    mova [rsp+gprsize*2+16*59], m2                        ;t56
+    ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a
+    mova [rsp+gprsize*2+16*41], m2                        ;t38a
+    mova [rsp+gprsize*2+16*60], m1                        ;t57a
+
+    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
+    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55
+    pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41
+    mova [rsp+gprsize*2+16*43], m0                        ;t40
+    mova [rsp+gprsize*2+16*58], m3                        ;t55
+    ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a
+    mova [rsp+gprsize*2+16*44], m3                        ;t41a
+    mova [rsp+gprsize*2+16*57], m0                        ;t54a
+
+    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
+    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53
+    pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43
+    mova [rsp+gprsize*2+16*46], m1                        ;t43
+    mova [rsp+gprsize*2+16*55], m2                        ;t52
+    ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a
+    mova [rsp+gprsize*2+16*45], m2                        ;t42a
+    mova [rsp+gprsize*2+16*56], m1                        ;t53a
+
+    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
+    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51
+    pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45
+    mova                    m6, m0                        ;t44
+    mova [rsp+gprsize*2+16*54], m3                        ;t51
+    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a
+    mova [rsp+gprsize*2+16*48], m3                        ;t45a
+    mova [rsp+gprsize*2+16*53], m0                        ;t50a
+
+    mova                    m0, [rsp+gprsize*2+16*49]     ;in3
+    pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49
+    pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47
+    mova                    m4, m3                        ;t49
+    mova                    m5, m0                        ;t46
+
+    jmp .main2
+
+ALIGN function_align
+.main:
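+    ; full odd half: seeds each butterfly from a pair of inputs whose
+    ; indices sum to 32 (in1/in31, in17/in15, ...), each scaled by its
+    ; cosine constant, then applies the same rotations as .main_fast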
+    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
+    mova                    m1, [rsp+gprsize*2+16*65]     ;in31
+    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a
+    pmulhrsw                m0, [o(pw_101x8)]             ;t32a
+    pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a
+    pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a
+    mova                    m7, [o(pd_2048)]
+    psubsw                  m4, m0, m1                    ;t33
+    paddsw                  m0, m1                        ;t32
+    psubsw                  m5, m3, m2                    ;t62
+    paddsw                  m3, m2                        ;t63
+    ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a
+    mova [rsp+gprsize*2+16*35], m0                        ;t32
+    mova [rsp+gprsize*2+16*36], m5                        ;t33a
+    mova [rsp+gprsize*2+16*65], m4                        ;t62a
+    mova [rsp+gprsize*2+16*66], m3                        ;t63
+
+    mova                    m0, [rsp+gprsize*2+16*63]     ;in17
+    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
+    pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a
+    pmulhrsw                m0, [o(pw_1660x8)]            ;t34a
+    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a
+    pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a
+    psubsw                  m4, m1, m0                    ;t34
+    paddsw                  m0, m1                        ;t35
+    psubsw                  m5, m2, m3                    ;t61
+    paddsw                  m3, m2                        ;t60
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a
+    mova [rsp+gprsize*2+16*37], m5                        ;t34a
+    mova [rsp+gprsize*2+16*38], m0                        ;t35
+    mova [rsp+gprsize*2+16*63], m3                        ;t60
+    mova [rsp+gprsize*2+16*64], m4                        ;t61a
+
+    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
+    mova                    m1, [rsp+gprsize*2+16*61]     ;in23
+    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a
+    pmulhrsw                m0, [o(pw_897x8)]             ;t36a
+    pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a
+    pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a
+    psubsw                  m4, m0, m1                    ;t37
+    paddsw                  m0, m1                        ;t36
+    psubsw                  m5, m3, m2                    ;t58
+    paddsw                  m3, m2                        ;t59
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a
+    mova [rsp+gprsize*2+16*39], m0                        ;t36
+    mova [rsp+gprsize*2+16*40], m5                        ;t37a
+    mova [rsp+gprsize*2+16*61], m4                        ;t58a
+    mova [rsp+gprsize*2+16*62], m3                        ;t59
+
+    mova                    m0, [rsp+gprsize*2+16*59]     ;in25
+    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
+    pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a
+    pmulhrsw                m0, [o(pw_2359x8)]            ;t38a
+    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a
+    pmulhrsw                m1, [o(pw_m700x8)]            ;t39a
+    psubsw                  m4, m1, m0                    ;t38
+    paddsw                  m0, m1                        ;t39
+    psubsw                  m5, m2, m3                    ;t57
+    paddsw                  m3, m2                        ;t56
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a
+    mova [rsp+gprsize*2+16*41], m5                        ;t38a
+    mova [rsp+gprsize*2+16*42], m0                        ;t39
+    mova [rsp+gprsize*2+16*59], m3                        ;t56
+    mova [rsp+gprsize*2+16*60], m4                        ;t57a
+
+    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
+    mova                    m1, [rsp+gprsize*2+16*57]     ;in27
+    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a
+    pmulhrsw                m0, [o(pw_501x8)]             ;t40a
+    pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a
+    pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a
+    psubsw                  m4, m0, m1                    ;t41
+    paddsw                  m0, m1                        ;t40
+    psubsw                  m5, m3, m2                    ;t54
+    paddsw                  m3, m2                        ;t55
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a
+    mova [rsp+gprsize*2+16*43], m0                        ;t40
+    mova [rsp+gprsize*2+16*44], m5                        ;t41a
+    mova [rsp+gprsize*2+16*57], m4                        ;t54a
+    mova [rsp+gprsize*2+16*58], m3                        ;t55
+
+    mova                    m0, [rsp+gprsize*2+16*55]     ;in21
+    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
+    pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a
+    pmulhrsw                m0, [o(pw_2019x8)]            ;t42a
+    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a
+    pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a
+    psubsw                  m4, m1, m0                    ;t42
+    paddsw                  m0, m1                        ;t43
+    psubsw                  m5, m2, m3                    ;t53
+    paddsw                  m3, m2                        ;t52
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a
+    mova [rsp+gprsize*2+16*45], m5                        ;t42a
+    mova [rsp+gprsize*2+16*46], m0                        ;t43
+    mova [rsp+gprsize*2+16*55], m3                        ;t52
+    mova [rsp+gprsize*2+16*56], m4                        ;t53a
+
+    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
+    mova                    m1, [rsp+gprsize*2+16*53]     ;in19
+    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a
+    pmulhrsw                m0, [o(pw_1285x8)]            ;t44a
+    pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a
+    pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a
+    psubsw                  m4, m0, m1                    ;t45
+    paddsw                  m0, m1                        ;t44
+    psubsw                  m5, m3, m2                    ;t50
+    paddsw                  m3, m2                        ;t51
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a
+    mova                    m6, m0
+    mova [rsp+gprsize*2+16*48], m5                        ;t45a
+    mova [rsp+gprsize*2+16*53], m4                        ;t50a
+    mova [rsp+gprsize*2+16*54], m3                        ;t51
+
+    mova                    m0, [rsp+gprsize*2+16*51]     ;in29
+    mova                    m1, [rsp+gprsize*2+16*49]     ;in3
+    pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a
+    pmulhrsw                m0, [o(pw_2675x8)]            ;t46a
+    pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a
+    pmulhrsw                m1, [o(pw_m301x8)]            ;t47a
+    psubsw                  m5, m1, m0                    ;t46
+    paddsw                  m0, m1                        ;t47
+    psubsw                  m4, m2, m3                    ;t49
+    paddsw                  m3, m2                        ;t48
+
+ALIGN function_align
+.main2:
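+    ; tail shared by .main and .main_fast: remaining t32..t63 rotations and
+    ; butterflies, then the final add/sub against tmp[0..31] from the even
+    ; half writes out0..out63 back into the scratch rows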
+    ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a
+    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
+    psubsw                  m2, m0, m6                    ;t44a
+    paddsw                  m0, m6                        ;t47a
+    psubsw                  m6, m3, m1                    ;t51a
+    paddsw                  m3, m1                        ;t48a
+    mova [rsp+gprsize*2+16*50], m0                        ;t47a
+    mova [rsp+gprsize*2+16*51], m3                        ;t48a
+    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51
+    mova [rsp+gprsize*2+16*47], m6                        ;t44
+    mova [rsp+gprsize*2+16*54], m2                        ;t51
+
+    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
+    mova                    m3, [rsp+gprsize*2+16*53]     ;t50a
+    psubsw                  m2, m4, m0                    ;t45
+    paddsw                  m4, m0                        ;t46
+    psubsw                  m6, m5, m3                    ;t50
+    paddsw                  m5, m3                        ;t49
+    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a
+    mova [rsp+gprsize*2+16*48], m6                        ;t45a
+    mova [rsp+gprsize*2+16*49], m4                        ;t46
+    mova [rsp+gprsize*2+16*52], m5                        ;t49
+    mova [rsp+gprsize*2+16*53], m2                        ;t50a
+
+    mova                    m0, [rsp+gprsize*2+16*43]     ;t40
+    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
+    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
+    mova                    m1, [rsp+gprsize*2+16*58]     ;t55
+    psubsw                  m4, m0, m2                    ;t43a
+    paddsw                  m0, m2                        ;t40a
+    psubsw                  m5, m1, m3                    ;t52a
+    paddsw                  m1, m3                        ;t55a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52
+    mova [rsp+gprsize*2+16*43], m0                        ;t40a
+    mova [rsp+gprsize*2+16*46], m5                        ;t43
+    mova [rsp+gprsize*2+16*55], m4                        ;t52
+    mova [rsp+gprsize*2+16*58], m1                        ;t55a
+
+    mova                    m0, [rsp+gprsize*2+16*44]     ;t41a
+    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
+    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
+    mova                    m1, [rsp+gprsize*2+16*57]     ;t54a
+    psubsw                  m4, m0, m2                    ;t42
+    paddsw                  m0, m2                        ;t41
+    psubsw                  m5, m1, m3                    ;t53
+    paddsw                  m1, m3                        ;t54
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a
+    mova [rsp+gprsize*2+16*44], m0                        ;t41
+    mova [rsp+gprsize*2+16*45], m5                        ;t42a
+    mova [rsp+gprsize*2+16*56], m4                        ;t53a
+    mova [rsp+gprsize*2+16*57], m1                        ;t54
+
+    mova                    m0, [rsp+gprsize*2+16*41]     ;t38a
+    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
+    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
+    mova                    m1, [rsp+gprsize*2+16*60]     ;t57a
+    psubsw                  m4, m0, m2                    ;t37
+    paddsw                  m0, m2                        ;t38
+    psubsw                  m5, m1, m3                    ;t58
+    paddsw                  m1, m3                        ;t57
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a
+    mova [rsp+gprsize*2+16*41], m0                        ;t38
+    mova [rsp+gprsize*2+16*40], m5                        ;t37a
+    mova [rsp+gprsize*2+16*61], m4                        ;t58a
+    mova [rsp+gprsize*2+16*60], m1                        ;t57
+
+    mova                    m0, [rsp+gprsize*2+16*42]     ;t39
+    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
+    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
+    mova                    m1, [rsp+gprsize*2+16*59]     ;t56
+    psubsw                  m4, m0, m2                    ;t36a
+    paddsw                  m0, m2                        ;t39a
+    psubsw                  m5, m1, m3                    ;t59a
+    paddsw                  m1, m3                        ;t56a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59
+    mova [rsp+gprsize*2+16*42], m0                        ;t39a
+    mova [rsp+gprsize*2+16*39], m5                        ;t36
+    mova [rsp+gprsize*2+16*62], m4                        ;t59
+    mova [rsp+gprsize*2+16*59], m1                        ;t56a
+
+    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
+    mova                    m2, [rsp+gprsize*2+16*38]     ;t35
+    mova                    m3, [rsp+gprsize*2+16*63]     ;t60
+    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
+    psubsw                  m4, m0, m2                    ;t35a
+    paddsw                  m0, m2                        ;t32a
+    psubsw                  m5, m1, m3                    ;t60a
+    paddsw                  m1, m3                        ;t63a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60
+    mova [rsp+gprsize*2+16*35], m0                        ;t32a
+    mova [rsp+gprsize*2+16*38], m5                        ;t35
+    mova [rsp+gprsize*2+16*63], m4                        ;t60
+    mova [rsp+gprsize*2+16*66], m1                        ;t63a
+
+    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
+    mova                    m2, [rsp+gprsize*2+16*37]     ;t34a
+    mova                    m3, [rsp+gprsize*2+16*64]     ;t61a
+    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
+    psubsw                  m4, m0, m2                    ;t34
+    paddsw                  m0, m2                        ;t33
+    psubsw                  m5, m1, m3                    ;t61
+    paddsw                  m1, m3                        ;t62
+    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a
+
+    mova                    m2, [rsp+gprsize*2+16*41]     ;t38
+    mova                    m3, [rsp+gprsize*2+16*60]     ;t57
+    psubsw                  m6, m0, m2                    ;t38a
+    paddsw                  m0, m2                        ;t33a
+    psubsw                  m2, m1, m3                    ;t57a
+    paddsw                  m1, m3                        ;t62a
+    mova [rsp+gprsize*2+16*36], m0                        ;t33a
+    mova [rsp+gprsize*2+16*65], m1                        ;t62a
+    ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57
+    mova [rsp+gprsize*2+16*41], m2                        ;t38
+    mova [rsp+gprsize*2+16*60], m6                        ;t57
+
+    mova                    m2, [rsp+gprsize*2+16*40]     ;t37
+    mova                    m3, [rsp+gprsize*2+16*61]     ;t58
+    psubsw                  m0, m5, m2                    ;t37
+    paddsw                  m5, m2                        ;t34
+    psubsw                  m1, m4, m3                    ;t58
+    paddsw                  m4, m3                        ;t61
+    ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a
+    mova [rsp+gprsize*2+16*37], m5                        ;t34
+    mova [rsp+gprsize*2+16*64], m4                        ;t61
+    mova [rsp+gprsize*2+16*40], m1                        ;t37a
+    mova [rsp+gprsize*2+16*61], m0                        ;t58a
+
+    mova                    m0, [rsp+gprsize*2+16*38]     ;t35
+    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
+    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
+    mova                    m1, [rsp+gprsize*2+16*63]     ;t60
+    psubsw                  m4, m0, m2                    ;t36a
+    paddsw                  m0, m2                        ;t35a
+    psubsw                  m5, m1, m3                    ;t59a
+    paddsw                  m1, m3                        ;t60a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59
+    mova [rsp+gprsize*2+16*38], m0                        ;t35a
+    mova [rsp+gprsize*2+16*39], m5                        ;t36
+    mova [rsp+gprsize*2+16*62], m4                        ;t59
+    mova [rsp+gprsize*2+16*63], m1                        ;t60a
+
+    mova                    m0, [rsp+gprsize*2+16*35]     ;t32a
+    mova                    m2, [rsp+gprsize*2+16*42]     ;t39a
+    mova                    m3, [rsp+gprsize*2+16*59]     ;t56a
+    mova                    m1, [rsp+gprsize*2+16*66]     ;t63a
+    psubsw                  m4, m0, m2                    ;t39
+    paddsw                  m0, m2                        ;t32
+    psubsw                  m5, m1, m3                    ;t56
+    paddsw                  m1, m3                        ;t63
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a
+    mova [rsp+gprsize*2+16*35], m0                        ;t32
+    mova [rsp+gprsize*2+16*42], m5                        ;t39a
+    mova [rsp+gprsize*2+16*59], m4                        ;t56a
+    mova [rsp+gprsize*2+16*66], m1                        ;t63
+
+    mova                    m0, [rsp+gprsize*2+16*50]     ;t47a
+    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
+    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
+    mova                    m1, [rsp+gprsize*2+16*51]     ;t48a
+    psubsw                  m4, m0, m2                    ;t40
+    paddsw                  m0, m2                        ;t47
+    psubsw                  m5, m1, m3                    ;t55
+    paddsw                  m1, m3                        ;t48
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a
+    mova [rsp+gprsize*2+16*50], m0                        ;t47
+    mova [rsp+gprsize*2+16*43], m5                        ;t40a
+    mova [rsp+gprsize*2+16*58], m4                        ;t55a
+    mova [rsp+gprsize*2+16*51], m1                        ;t48
+
+    mova                    m0, [rsp+gprsize*2+16*49]     ;t46
+    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
+    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
+    mova                    m1, [rsp+gprsize*2+16*52]     ;t49
+    psubsw                  m4, m0, m2                    ;t41a
+    paddsw                  m0, m2                        ;t46a
+    psubsw                  m5, m1, m3                    ;t54a
+    paddsw                  m1, m3                        ;t49a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54
+    mova [rsp+gprsize*2+16*49], m0                        ;t46a
+    mova [rsp+gprsize*2+16*44], m5                        ;t41
+    mova [rsp+gprsize*2+16*57], m4                        ;t54
+    mova [rsp+gprsize*2+16*52], m1                        ;t49a
+
+    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
+    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
+    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
+    mova                    m1, [rsp+gprsize*2+16*53]     ;t50a
+    psubsw                  m4, m0, m2                    ;t42
+    paddsw                  m0, m2                        ;t45
+    psubsw                  m5, m1, m3                    ;t53
+    paddsw                  m1, m3                        ;t50
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a
+    mova [rsp+gprsize*2+16*48], m0                        ;t45
+    mova [rsp+gprsize*2+16*45], m5                        ;t42a
+    mova [rsp+gprsize*2+16*56], m4                        ;t53a
+    mova [rsp+gprsize*2+16*53], m1                        ;t50
+
+    mova                    m0, [rsp+gprsize*2+16*47]     ;t44
+    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
+    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
+    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
+    psubsw                  m4, m0, m2                    ;t43a
+    paddsw                  m0, m2                        ;t44a
+    psubsw                  m5, m1, m3                    ;t52a
+    paddsw                  m1, m3                        ;t51a
+    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52
+
+    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
+    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
+    psubsw                  m6, m2, m0                    ;t44
+    paddsw                  m2, m0                        ;t35
+    psubsw                  m0, m3, m2                    ;out35
+    paddsw                  m2, m3                        ;out28
+    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
+    mova [rsp+gprsize*2+16*38], m0                        ;out35
+    mova [rsp+gprsize*2+16*31], m2                        ;out28
+    psubsw                  m0, m3, m1                    ;t51
+    paddsw                  m3, m1                        ;t60
+    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
+    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
+    psubsw                  m1, m2, m3                    ;out60
+    paddsw                  m2, m3                        ;out3
+    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
+    mova [rsp+gprsize*2+16*63], m1                        ;out60
+    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
+    psubsw                  m1, m3, m0                    ;out44
+    paddsw                  m3, m0                        ;out19
+    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
+
+    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
+    mova [rsp+gprsize*2+16*47], m1                        ;out44
+    mova [rsp+gprsize*2+16*22], m3                        ;out19
+    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
+    psubsw                  m3, m2, m6                    ;out51
+    paddsw                  m2, m6                        ;out12
+    mova [rsp+gprsize*2+16*54], m3                        ;out51
+    mova [rsp+gprsize*2+16*15], m2                        ;out12
+    psubsw                  m2, m0, m5                    ;t43a
+    paddsw                  m0, m5                        ;t36a
+    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
+    psubsw                  m3, m1, m4                    ;t52a
+    paddsw                  m1, m4                        ;t59a
+    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
+    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
+    psubsw                  m6, m5, m0                    ;out36
+    paddsw                  m5, m0                        ;out27
+    psubsw                  m0, m4, m1                    ;out59
+    paddsw                  m4, m1                        ;out4
+    mova [rsp+gprsize*2+16*39], m6                        ;out36
+    mova [rsp+gprsize*2+16*30], m5                        ;out27
+    mova [rsp+gprsize*2+16*62], m0                        ;out59
+    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
+    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
+    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
+    psubsw                  m4, m0, m3                    ;out43
+    paddsw                  m0, m3                        ;out20
+    psubsw                  m6, m5, m2                    ;out52
+    paddsw                  m5, m2                        ;out11
+    mova [rsp+gprsize*2+16*46], m4                        ;out43
+    mova [rsp+gprsize*2+16*23], m0                        ;out20
+    mova [rsp+gprsize*2+16*55], m6                        ;out52
+    mova [rsp+gprsize*2+16*14], m5                        ;out11
+
+    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
+    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
+    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
+    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
+    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
+    psubsw                  m4, m0, m5                    ;t42
+    paddsw                  m0, m5                        ;t37
+    psubsw                  m5, m1, m3                    ;t53
+    paddsw                  m1, m3                        ;t58
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t42a, t53a
+    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
+    psubsw                  m6, m2, m0                    ;out37
+    paddsw                  m2, m0                        ;out26
+    psubsw                  m0, m3, m1                    ;out58
+    paddsw                  m3, m1                        ;out5
+    mova [rsp+gprsize*2+16*40], m6                        ;out37
+    mova [rsp+gprsize*2+16*29], m2                        ;out26
+    mova [rsp+gprsize*2+16*61], m0                        ;out58
+    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
+    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
+    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
+    psubsw                  m2, m0, m5                    ;out42
+    paddsw                  m0, m5                        ;out21
+    psubsw                  m3, m1, m4                    ;out53
+    paddsw                  m1, m4                        ;out10
+    mova [rsp+gprsize*2+16*45], m2                        ;out42
+    mova [rsp+gprsize*2+16*24], m0                        ;out21
+    mova [rsp+gprsize*2+16*56], m3                        ;out53
+    mova [rsp+gprsize*2+16*13], m1                        ;out10
+
+    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
+    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
+    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
+    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
+    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
+    psubsw                  m4, m0, m5                    ;t41a
+    paddsw                  m0, m5                        ;t38a
+    psubsw                  m5, m1, m3                    ;t54a
+    paddsw                  m1, m3                        ;t57a
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
+    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
+    psubsw                  m6, m2, m0                    ;out38
+    paddsw                  m2, m0                        ;out25
+    psubsw                  m0, m3, m1                    ;out57
+    paddsw                  m3, m1                        ;out6
+    mova [rsp+gprsize*2+16*41], m6                        ;out38
+    mova [rsp+gprsize*2+16*28], m2                        ;out25
+    mova [rsp+gprsize*2+16*60], m0                        ;out57
+    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
+    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
+    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
+    psubsw                  m2, m0, m5                    ;out41
+    paddsw                  m0, m5                        ;out22
+    psubsw                  m3, m1, m4                    ;out54
+    paddsw                  m1, m4                        ;out9
+    mova [rsp+gprsize*2+16*44], m2                        ;out41
+    mova [rsp+gprsize*2+16*25], m0                        ;out22
+    mova [rsp+gprsize*2+16*57], m3                        ;out54
+    mova [rsp+gprsize*2+16*12], m1                        ;out9
+
+    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
+    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
+    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
+    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
+    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
+    psubsw                  m4, m0, m5                    ;t40
+    paddsw                  m0, m5                        ;t39
+    psubsw                  m5, m1, m3                    ;t55
+    paddsw                  m1, m3                        ;t56
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
+    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
+    psubsw                  m6, m2, m0                    ;out39
+    paddsw                  m2, m0                        ;out24
+    psubsw                  m0, m3, m1                    ;out56
+    paddsw                  m3, m1                        ;out7
+    mova [rsp+gprsize*2+16*42], m6                        ;out39
+    mova [rsp+gprsize*2+16*27], m2                        ;out24
+    mova [rsp+gprsize*2+16*59], m0                        ;out56
+    mova [rsp+gprsize*2+16*10], m3                        ;out7
+    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
+    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
+    psubsw                  m2, m0, m5                    ;out40
+    paddsw                  m0, m5                        ;out23
+    psubsw                  m3, m1, m4                    ;out55
+    paddsw                  m1, m4                        ;out8
+    mova [rsp+gprsize*2+16*43], m2                        ;out40
+    mova [rsp+gprsize*2+16*26], m0                        ;out23
+    mova [rsp+gprsize*2+16*58], m3                        ;out55
+    mova [rsp+gprsize*2+16*11], m1                        ;out8
+
+    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
+    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
+    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
+    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
+    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
+    psubsw                  m4, m0, m5                    ;t45a
+    paddsw                  m0, m5                        ;t34a
+    psubsw                  m5, m1, m3                    ;t50a
+    paddsw                  m1, m3                        ;t61a
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
+    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
+    psubsw                  m6, m2, m0                    ;out34
+    paddsw                  m2, m0                        ;out29
+    psubsw                  m0, m3, m1                    ;out61
+    paddsw                  m3, m1                        ;out2
+    mova [rsp+gprsize*2+16*37], m6                        ;out34
+    mova [rsp+gprsize*2+16*32], m2                        ;out29
+    mova [rsp+gprsize*2+16*64], m0                        ;out61
+    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
+    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
+    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
+    psubsw                  m2, m0, m5                    ;out45
+    paddsw                  m0, m5                        ;out18
+    psubsw                  m3, m1, m4                    ;out50
+    paddsw                  m1, m4                        ;out13
+    mova [rsp+gprsize*2+16*48], m2                        ;out45
+    mova [rsp+gprsize*2+16*21], m0                        ;out18
+    mova [rsp+gprsize*2+16*53], m3                        ;out50
+    mova [rsp+gprsize*2+16*16], m1                        ;out13
+
+    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
+    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
+    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
+    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
+    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
+    psubsw                  m4, m0, m5                    ;t46
+    paddsw                  m0, m5                        ;t33
+    psubsw                  m5, m1, m3                    ;t49
+    paddsw                  m1, m3                        ;t62
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t46a, t49a
+    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
+    psubsw                  m6, m2, m0                    ;out33
+    paddsw                  m2, m0                        ;out30
+    psubsw                  m0, m3, m1                    ;out62
+    paddsw                  m3, m1                        ;out1
+    mova [rsp+gprsize*2+16*36], m6                        ;out33
+    mova [rsp+gprsize*2+16*33], m2                        ;out30
+    mova [rsp+gprsize*2+16*65], m0                        ;out62
+    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
+    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
+    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
+    psubsw                  m2, m0, m5                    ;out46
+    paddsw                  m0, m5                        ;out17
+    psubsw                  m3, m1, m4                    ;out49
+    paddsw                  m1, m4                        ;out14
+    mova [rsp+gprsize*2+16*49], m2                        ;out46
+    mova [rsp+gprsize*2+16*20], m0                        ;out17
+    mova [rsp+gprsize*2+16*52], m3                        ;out49
+    mova [rsp+gprsize*2+16*17], m1                        ;out14
+
+    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
+    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
+    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
+    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
+    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
+    psubsw                  m4, m0, m5                    ;t47a
+    paddsw                  m0, m5                        ;t32a
+    psubsw                  m5, m1, m3                    ;t48a
+    paddsw                  m1, m3                        ;t63a
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
+    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
+    psubsw                  m6, m2, m0                    ;out32
+    paddsw                  m2, m0                        ;out31
+    psubsw                  m0, m3, m1                    ;out63
+    paddsw                  m3, m1                        ;out0
+    mova [rsp+gprsize*2+16*35], m6                        ;out32
+    mova [rsp+gprsize*2+16*34], m2                        ;out31
+    mova [rsp+gprsize*2+16*66], m0                        ;out63
+    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
+    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
+    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
+    psubsw                  m2, m0, m5                    ;out47
+    paddsw                  m0, m5                        ;out16
+    psubsw                  m3, m1, m4                    ;out48
+    paddsw                  m1, m4                        ;out15
+    mova [rsp+gprsize*2+16*50], m2                        ;out47
+    mova [rsp+gprsize*2+16*19], m0                        ;out16
+    mova [rsp+gprsize*2+16*51], m3                        ;out48
+    mova [rsp+gprsize*2+16*18], m1                        ;out15
+    ret
+
+
+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_64x16_internal)
+    RET
+
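+    ; eob == 0 means only the DC coefficient is coded, so the whole 2-D
+    ; transform collapses to adding one rounded constant to every pixel.
+    ; pw_2896x8 applies the 2896/4096 (~1/sqrt(2)) scale via pmulhrsw;
+    ; storing eobd (known to be 0 here) also clears the coefficient.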
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd
+    mov                    r3d, 16
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)]
+
+.body:
+    pmulhrsw                m0, m2
+    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
+    pmulhrsw                m0, m1
+    pmulhrsw                m0, m2
+    pshuflw                 m0, m0, q0000
+    punpcklwd               m0, m0
+    pxor                    m7, m7
+
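+    ; broadcast the DC delta to all 8 words of m0, then for each of r3d
+    ; rows add it to 64 pixels: unpack bytes to words against zero (m7),
+    ; add, and pack back with unsigned saturation.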
+.loop:
+    mova                    m1, [dstq+16*0]
+    mova                    m3, [dstq+16*1]
+    mova                    m5, [dstq+16*2]
+    mova                    m6, [dstq+16*3]
+    punpckhbw               m2, m1, m7
+    punpcklbw               m1, m7
+    punpckhbw               m4, m3, m7
+    punpcklbw               m3, m7
+    paddw                   m2, m0
+    paddw                   m1, m0
+    paddw                   m4, m0
+    paddw                   m3, m0
+    packuswb                m1, m2
+    packuswb                m3, m4
+    punpckhbw               m2, m5, m7
+    punpcklbw               m5, m7
+    punpckhbw               m4, m6, m7
+    punpcklbw               m6, m7
+    paddw                   m2, m0
+    paddw                   m5, m0
+    paddw                   m4, m0
+    paddw                   m6, m0
+    packuswb                m5, m2
+    packuswb                m6, m4
+    mova           [dstq+16*0], m1
+    mova           [dstq+16*1], m3
+    mova           [dstq+16*2], m5
+    mova           [dstq+16*3], m6
+    add                   dstq, strideq
+    dec                    r3d
+    jg .loop
+    jmp                   tx2q
+
+.end:
+    RET
+
+
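+; helpers for the 64-point transforms below: LOAD_4ROWS fills m0-m3 and
+; LOAD_4ROWS_H fills m4-m7 from four rows at the given stride; is_rect2
+; pre-scales by 2896/4096 (~1/sqrt(2)), the rectangular-transform factor.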
+%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
+
+%if %3
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [%1+%2*0]
+    pmulhrsw             m1, m3, [%1+%2*1]
+    pmulhrsw             m2, m3, [%1+%2*2]
+    pmulhrsw             m3, [%1+%2*3]
+%else
+    mova                 m0, [%1+%2*0]
+    mova                 m1, [%1+%2*1]
+    mova                 m2, [%1+%2*2]
+    mova                 m3, [%1+%2*3]
+%endif
+%endmacro
+
+%macro LOAD_4ROWS_H 2 ;src, stride
+    mova                 m4, [%1+%2*0]
+    mova                 m5, [%1+%2*1]
+    mova                 m6, [%1+%2*2]
+    mova                 m7, [%1+%2*3]
+%endmacro
+
+cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mov                    r3d, 2
+    mov  [rsp+gprsize*2+16*67], dstq
+    lea                   dstq, [rsp+gprsize+16*68]
+
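+    ; pass 1: 64-point iDCT over 8 of the 16 input rows per iteration.
+    ; only the low 32 coefficients per row are coded (AV1 zeroes the
+    ; upper half of a 64-point transform), so the even half goes through
+    ; the shared 8/16/32-point mains and the odd half through
+    ; m(idct_16x64_internal).main. dstq is repurposed as a stack scratch
+    ; pointer for output columns 32-63; columns 0-31 go back to coeffq.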
+.pass1_loop:
+    LOAD_4ROWS     coeffq+32*0, 32*8
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    pxor                    m4, m4
+    LOAD_4ROWS     coeffq+32*4, 32*8
+
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+32*2, 32*4
+    mova   [rsp+gprsize+16*19], m0
+    mova   [rsp+gprsize+16*26], m1
+    mova   [rsp+gprsize+16*23], m2
+    mova   [rsp+gprsize+16*22], m3
+    mova   [rsp+gprsize+16*21], m4
+    mova   [rsp+gprsize+16*24], m5
+    mova   [rsp+gprsize+16*25], m6
+    mova   [rsp+gprsize+16*20], m7
+
+    call m(idct_8x32_internal).main_fast
+    SAVE_8ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+32*1, 32*2
+    mova   [rsp+gprsize+16*35], m0                        ;in1
+    mova   [rsp+gprsize+16*49], m1                        ;in3
+    mova   [rsp+gprsize+16*43], m2                        ;in5
+    mova   [rsp+gprsize+16*41], m3                        ;in7
+    mova   [rsp+gprsize+16*39], m4                        ;in9
+    mova   [rsp+gprsize+16*45], m5                        ;in11
+    mova   [rsp+gprsize+16*47], m6                        ;in13
+    mova   [rsp+gprsize+16*37], m7                        ;in15
+
+    LOAD_8ROWS    coeffq+32*17, 32*2
+    mova   [rsp+gprsize+16*63], m0                        ;in17
+    mova   [rsp+gprsize+16*53], m1                        ;in19
+    mova   [rsp+gprsize+16*55], m2                        ;in21
+    mova   [rsp+gprsize+16*61], m3                        ;in23
+    mova   [rsp+gprsize+16*59], m4                        ;in25
+    mova   [rsp+gprsize+16*57], m5                        ;in27
+    mova   [rsp+gprsize+16*51], m6                        ;in29
+    mova   [rsp+gprsize+16*65], m7                        ;in31
+
+    call m(idct_16x64_internal).main
+
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
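+    ; tx2q acts as a continuation address: the shared 8x8 pass-1 tail
+    ; scales by pw_8192 (pmulhrsw, i.e. (x+2)>>2), transposes one 8x8
+    ; tile and jumps back through tx2q, so the .pass1_end* labels below
+    ; chain eight transpose/store steps per loop iteration.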
+.pass1_end:
+    SAVE_8ROWS     coeffq+32*0, 32
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+32*8, 32
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+32*16, 32
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+32*24, 32
+    LOAD_8ROWS   rsp+gprsize+16*35, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end4:
+    SAVE_8ROWS       dstq+32*0, 32
+    LOAD_8ROWS   rsp+gprsize+16*43, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end5)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end5:
+    SAVE_8ROWS       dstq+32*8, 32
+    LOAD_8ROWS   rsp+gprsize+16*51, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end6)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end6:
+    SAVE_8ROWS      dstq+32*16, 32
+    LOAD_8ROWS   rsp+gprsize+16*59, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end7)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end7:
+    SAVE_8ROWS      dstq+32*24, 32
+
+    add                 coeffq, 16
+    add                   dstq, 16
+    dec                    r3d
+    jg .pass1_loop
+
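+    ; pass 2: 16-point column transform over 8-column strips, first the
+    ; 32 columns held in coeffq, then (in .pass2_loop2) the 32 columns
+    ; that pass 1 spilled to the stack.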
+.pass2:
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    sub                 coeffq, 32
+    mov                    r3d, 4
+
+.pass2_loop:
+    mov  [rsp+gprsize*1+16*67], r3d
+
+    LOAD_4ROWS     coeffq+16*0, 32*2
+    LOAD_4ROWS_H   coeffq+16*1, 32*2
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_4ROWS     coeffq+16*2, 32*2
+    LOAD_4ROWS_H   coeffq+16*3, 32*2
+    call m(idct_16x8_internal).main
+
+    mov                    r3, dstq
+    lea                  tx2q, [o(m(idct_64x16_internal).end)]
+    lea                  dstq, [dstq+strideq*8]
+    jmp  m(idct_8x8_internal).end
+
+.end:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    lea                  tx2q, [o(m(idct_64x16_internal).end1)]
+    mov                  dstq, r3
+    jmp  m(idct_8x8_internal).end
+
+.end1:
+    pxor                   m7, m7
+    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
+
+    add                 coeffq, 16*16
+    mov                    r3d, [rsp+gprsize*1+16*67]
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    add                   dstq, 8
+    mov  [rsp+gprsize*2+16*67], dstq
+    dec                    r3d
+    jg .pass2_loop
+
+    mov                    r3d, 4
+    lea                 coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+    mov  [rsp+gprsize*1+16*67], r3d
+
+    LOAD_4ROWS     coeffq+16*0, 32*2
+    LOAD_4ROWS_H   coeffq+16*1, 32*2
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_4ROWS     coeffq+16*2, 32*2
+    LOAD_4ROWS_H   coeffq+16*3, 32*2
+    call m(idct_16x8_internal).main
+
+    mov                    r3, dstq
+    lea                  tx2q, [o(m(idct_64x16_internal).end2)]
+    lea                  dstq, [dstq+strideq*8]
+    jmp  m(idct_8x8_internal).end
+
+.end2:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    lea                  tx2q, [o(m(idct_64x16_internal).end3)]
+    mov                  dstq, r3
+    jmp  m(idct_8x8_internal).end
+
+.end3:
+
+    add                 coeffq, 16*16
+    mov                    r3d, [rsp+gprsize*1+16*67]
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    add                   dstq, 8
+    mov  [rsp+gprsize*2+16*67], dstq
+    dec                    r3d
+    jg .pass2_loop2
+    ret
+
+
+cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_32x64_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_16384)]
+    mov               [coeffq], eobd
+    pmulhrsw                m0, m1
+    mov                    r3d, 64
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)]
+    jmp m(inv_txfm_add_dct_dct_32x8).body
+
+.end:
+    RET
+
+
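+; eob selects the amount of work in pass 1: eob < 136 implies all coded
+; coefficients lie in the low-frequency top-left region, so only half the
+; strips are processed and the reduced .fast loads/mains are used.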
+cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 2
+    sub                   eobd, 136
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+    LOAD_8ROWS     coeffq+64*1, 64*2, 1
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mov                   tx2d, [rsp+gprsize*1+16*67]
+    test                  tx2d, tx2d
+    jl .fast
+
+.full:
+    LOAD_8ROWS     coeffq+64*0, 64*4, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+64*2, 64*4, 1
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS    coeffq+64*17, 64*2, 1
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+    jmp .pass1_end
+
+.fast:
+    LOAD_4ROWS          coeffq, 256, 1
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_4ROWS    coeffq+128*1, 256, 1
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+
+.pass1_end:
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+64*24, 64
+
+    add                 coeffq, 16
+    dec                    r3d
+    jg .pass1_loop
+
+.pass2:
+    mov                 coeffq, [rsp+gprsize*2+16*67]
+    mov                    r3d, 4
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_16x64_internal).end1)]
+    jmp m(idct_16x64_internal).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_64x32_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_16384)]
+    pmulhrsw                m0, m1
+    mov               [coeffq], eobd
+    mov                    r3d, 32
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
+    jmp m(inv_txfm_add_dct_dct_64x16).body
+
+.end:
+    RET
+
+cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r4d, 2
+    sub                   eobd, 136
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*2+16*67], coeffq
+    mov  [rsp+gprsize*3+16*67], dstq
+    lea                   dstq, [rsp+gprsize+16*69]
+    mov  [rsp+gprsize*4+16*67], dstq
+
+.pass1_loop:
+    LOAD_4ROWS     coeffq+64*0, 64*8, 1
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    pxor                    m4, m4
+    LOAD_4ROWS     coeffq+64*4, 64*8, 1
+
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+64*2, 64*4, 1
+    mova   [rsp+gprsize+16*19], m0
+    mova   [rsp+gprsize+16*26], m1
+    mova   [rsp+gprsize+16*23], m2
+    mova   [rsp+gprsize+16*22], m3
+    mova   [rsp+gprsize+16*21], m4
+    mova   [rsp+gprsize+16*24], m5
+    mova   [rsp+gprsize+16*25], m6
+    mova   [rsp+gprsize+16*20], m7
+
+    call m(idct_8x32_internal).main_fast
+    SAVE_8ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+64*1, 64*2, 1
+    mova   [rsp+gprsize+16*35], m0                        ;in1
+    mova   [rsp+gprsize+16*49], m1                        ;in3
+    mova   [rsp+gprsize+16*43], m2                        ;in5
+    mova   [rsp+gprsize+16*41], m3                        ;in7
+    mova   [rsp+gprsize+16*39], m4                        ;in9
+    mova   [rsp+gprsize+16*45], m5                        ;in11
+    mova   [rsp+gprsize+16*47], m6                        ;in13
+    mova   [rsp+gprsize+16*37], m7                        ;in15
+
+    LOAD_8ROWS    coeffq+64*17, 64*2, 1
+    mova   [rsp+gprsize+16*63], m0                        ;in17
+    mova   [rsp+gprsize+16*53], m1                        ;in19
+    mova   [rsp+gprsize+16*55], m2                        ;in21
+    mova   [rsp+gprsize+16*61], m3                        ;in23
+    mova   [rsp+gprsize+16*59], m4                        ;in25
+    mova   [rsp+gprsize+16*57], m5                        ;in27
+    mova   [rsp+gprsize+16*51], m6                        ;in29
+    mova   [rsp+gprsize+16*65], m7                        ;in31
+
+    call m(idct_16x64_internal).main
+
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS     coeffq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+64*24, 64
+    LOAD_8ROWS   rsp+gprsize+16*35, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS       dstq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*43, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end5)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end5:
+    SAVE_8ROWS       dstq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*51, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end6)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end6:
+    SAVE_8ROWS      dstq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*59, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end7)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end7:
+    SAVE_8ROWS      dstq+64*24, 64
+
+    add                 coeffq, 16
+    add                   dstq, 16
+    dec                    r3d
+    jg .pass1_loop
+
+.pass2:
+    mov                 coeffq, [rsp+gprsize*4+16*67]
+    mov                   dstq, [rsp+gprsize*3+16*67]
+    mov                   eobd, [rsp+gprsize*1+16*67]
+    lea                   dstq, [dstq+32]
+    mov  [rsp+gprsize*1+16*35], eobd
+    lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
+    mov                    r3d, 4
+    jmp m(idct_32x32_internal).pass2_loop
+
+.pass2_end:
+    mova    [rsp+gprsize+16*0], m7
+    lea                     r3, [o(m(idct_64x32_internal).pass2_end1)]
+    jmp  m(idct_8x32_internal).end2
+
+.pass2_end1:
+    lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
+    add                 coeffq, 16*32
+    mov                   dstq, [rsp+gprsize*2+16*35]
+    mov                    r3d, [rsp+gprsize*3+16*35]
+    dec                    r3d
+    jg m(idct_32x32_internal).pass2_loop
+
+.pass2_end2:
+    mov                   dstq, [rsp+gprsize*3+16*67]
+    mov                 coeffq, [rsp+gprsize*2+16*67]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
+    mov                    r3d, 4
+    jmp m(idct_32x32_internal).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd
+    jz .dconly
+
+    call m(idct_64x64_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd
+    mov                    r3d, 64
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
+    jmp m(inv_txfm_add_dct_dct_64x16).body
+
+cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                    r5d, 4
+    mov                    r4d, 2
+    sub                   eobd, 136
+    cmovns                 r4d, r5d
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, r4d
+    mov  [rsp+gprsize*4+16*67], coeffq
+    mov  [rsp+gprsize*3+16*67], dstq
+    lea                   dstq, [rsp+gprsize+16*69]
+    mov  [rsp+gprsize*2+16*67], dstq
+
+.pass1_loop:
+    LOAD_4ROWS     coeffq+64*0, 64*8
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    pxor                    m4, m4
+    LOAD_4ROWS     coeffq+64*4, 64*8
+
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+64*2, 64*4
+    mova   [rsp+gprsize+16*19], m0
+    mova   [rsp+gprsize+16*26], m1
+    mova   [rsp+gprsize+16*23], m2
+    mova   [rsp+gprsize+16*22], m3
+    mova   [rsp+gprsize+16*21], m4
+    mova   [rsp+gprsize+16*24], m5
+    mova   [rsp+gprsize+16*25], m6
+    mova   [rsp+gprsize+16*20], m7
+
+    call m(idct_8x32_internal).main_fast
+    SAVE_8ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+64*1, 64*2
+    mova   [rsp+gprsize+16*35], m0                        ;in1
+    mova   [rsp+gprsize+16*49], m1                        ;in3
+    mova   [rsp+gprsize+16*43], m2                        ;in5
+    mova   [rsp+gprsize+16*41], m3                        ;in7
+    mova   [rsp+gprsize+16*39], m4                        ;in9
+    mova   [rsp+gprsize+16*45], m5                        ;in11
+    mova   [rsp+gprsize+16*47], m6                        ;in13
+    mova   [rsp+gprsize+16*37], m7                        ;in15
+
+    LOAD_8ROWS    coeffq+64*17, 64*2
+    mova   [rsp+gprsize+16*63], m0                        ;in17
+    mova   [rsp+gprsize+16*53], m1                        ;in19
+    mova   [rsp+gprsize+16*55], m2                        ;in21
+    mova   [rsp+gprsize+16*61], m3                        ;in23
+    mova   [rsp+gprsize+16*59], m4                        ;in25
+    mova   [rsp+gprsize+16*57], m5                        ;in27
+    mova   [rsp+gprsize+16*51], m6                        ;in29
+    mova   [rsp+gprsize+16*65], m7                        ;in31
+
+    call m(idct_16x64_internal).main
+
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end:
+    SAVE_8ROWS     coeffq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+64*24, 64
+    LOAD_8ROWS   rsp+gprsize+16*35, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end4:
+    SAVE_8ROWS       dstq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*43, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end5)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end5:
+    SAVE_8ROWS       dstq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*51, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end6)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end6:
+    SAVE_8ROWS      dstq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*59, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end7)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end7:
+    SAVE_8ROWS      dstq+64*24, 64
+
+    add                 coeffq, 16
+    add                   dstq, 16
+    dec                    r3d
+    jg .pass1_loop
+
+.pass2:
+    mov                   dstq, [rsp+gprsize*3+16*67]
+    mov                 coeffq, [rsp+gprsize*2+16*67]
+    lea                   dstq, [dstq+32]
+    mov                    r3d, 4
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_64x64_internal).pass2_end)]
+    jmp m(idct_16x64_internal).pass2_loop
+
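+    ; the shared 8x32 store tail addresses its temporaries relative to
+    ; rsp, so rsp is advanced by 16*32 here to make it operate on the
+    ; upper half of the 64-row column buffer; it is restored again in
+    ; .pass2_end1.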
+.pass2_end:
+    LOAD_8ROWS   rsp+gprsize+16*35, 16
+    lea                   dstq, [dstq+strideq*2]
+    add                    rsp, 16*32
+    mova    [rsp+gprsize+16*0], m7
+    lea                     r3, [o(m(idct_64x64_internal).pass2_end1)]
+    jmp  m(idct_8x32_internal).end2
+
+.pass2_end1:
+    add                 coeffq, 16*32
+    sub                    rsp, 16*32
+
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    mov                    r3d, [rsp+gprsize*3+16*67]
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_64x64_internal).pass2_end)]
+
+    dec                    r3d
+    jg  m(idct_16x64_internal).pass2_loop
+
+.pass2_end2:
+    mov                 coeffq, [rsp+gprsize*4+16*67]
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    mov                    r3d, 4
+    sub                   dstq, 72
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*67], r4
+    lea                     r4, [o(m(idct_16x64_internal).end1)]
+    jmp m(idct_16x64_internal).pass2_loop
diff --git a/src/x86/loopfilter.asm b/src/x86/loopfilter.asm
new file mode 100644
index 0000000..5011868
--- /dev/null
+++ b/src/x86/loopfilter.asm
@@ -0,0 +1,1600 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 16 db 7, 1
+pb_3_1: times 16 db 3, 1
+pb_2_1: times 16 db 2, 1
+pb_m1_0: times 16 db -1, 0
+pb_m1_1: times 16 db -1, 1
+pb_m1_2: times 16 db -1, 2
+pb_1: times 32 db 1
+pb_2: times 32 db 2
+pb_3: times 32 db 3
+pb_4: times 32 db 4
+pb_16: times 32 db 16
+pb_63: times 32 db 63
+pb_64: times 32 db 64
+pb_128: times 32 db 0x80
+pb_129: times 32 db 0x81
+pb_240: times 32 db 0xf0
+pb_248: times 32 db 0xf8
+pb_254: times 32 db 0xfe
+
+pw_2048: times 16 dw 2048
+pw_4096: times 16 dw 4096
+
+pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
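+; per-byte absolute difference: one of the two saturating subtractions
+; is zero, so or'ing them yields |a - b| without widening to words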
+%macro ABSSUB 4 ; dst, a, b, tmp
+    psubusb       %1, %2, %3
+    psubusb       %4, %3, %2
+    por           %1, %4
+%endmacro
+
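+; for vertical edges with wd=4, the four input registers hold the
+; filtered p1/p0/q0/q1 of 32 rows each; transpose them into 4-byte
+; groups and scatter one dword per row, starting 2 pixels left of the edge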
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+    ; transpose 16x4
+    punpcklbw    m%5, m%1, m%2
+    punpckhbw    m%1, m%2
+    punpcklbw    m%2, m%3, m%4
+    punpckhbw    m%3, m%4
+    punpcklwd    m%4, m%5, m%2
+    punpckhwd    m%5, m%2
+    punpcklwd    m%2, m%1, m%3
+    punpckhwd    m%1, m%3
+
+    ; write out
+    movd [dstq+strideq*0-2], xm%4
+    pextrd [dstq+strideq*1-2], xm%4, 1
+    pextrd [dstq+strideq*2-2], xm%4, 2
+    pextrd [dstq+stride3q-2], xm%4, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%5
+    pextrd [dstq+strideq*1-2], xm%5, 1
+    pextrd [dstq+strideq*2-2], xm%5, 2
+    pextrd [dstq+stride3q-2], xm%5, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%2
+    pextrd [dstq+strideq*1-2], xm%2, 1
+    pextrd [dstq+strideq*2-2], xm%2, 2
+    pextrd [dstq+stride3q-2], xm%2, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%1
+    pextrd [dstq+strideq*1-2], xm%1, 1
+    pextrd [dstq+strideq*2-2], xm%1, 2
+    pextrd [dstq+stride3q-2], xm%1, 3
+    lea         dstq, [dstq+strideq*4]
+
+    vextracti128 xm%4, m%4, 1
+    vextracti128 xm%5, m%5, 1
+    vextracti128 xm%2, m%2, 1
+    vextracti128 xm%1, m%1, 1
+
+    movd [dstq+strideq*0-2], xm%4
+    pextrd [dstq+strideq*1-2], xm%4, 1
+    pextrd [dstq+strideq*2-2], xm%4, 2
+    pextrd [dstq+stride3q-2], xm%4, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%5
+    pextrd [dstq+strideq*1-2], xm%5, 1
+    pextrd [dstq+strideq*2-2], xm%5, 2
+    pextrd [dstq+stride3q-2], xm%5, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%2
+    pextrd [dstq+strideq*1-2], xm%2, 1
+    pextrd [dstq+strideq*2-2], xm%2, 2
+    pextrd [dstq+stride3q-2], xm%2, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%1
+    pextrd [dstq+strideq*1-2], xm%1, 1
+    pextrd [dstq+strideq*2-2], xm%1, 2
+    pextrd [dstq+stride3q-2], xm%1, 3
+    lea         dstq, [dstq+strideq*4]
+%endmacro
+
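+; full 16x16 byte transpose via four rounds of byte/word/dword/qword
+; interleaves; one register is spilled to %3 along the way since the
+; transpose needs a 17th temporary, and %1/%2 control (roughly) whether
+; the 15th input starts in, and one output is left in, that memory slot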
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+    mova          %3, m15
+%endif
+
+    ; input in m0-15
+    punpcklbw    m15, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m2, m3
+    punpckhbw     m2, m3
+    punpcklbw     m3, m4, m5
+    punpckhbw     m4, m5
+    punpcklbw     m5, m6, m7
+    punpckhbw     m6, m7
+    punpcklbw     m7, m8, m9
+    punpckhbw     m8, m9
+    punpcklbw     m9, m10, m11
+    punpckhbw    m10, m11
+    punpcklbw    m11, m12, m13
+    punpckhbw    m12, m13
+    mova         m13, %3
+    mova          %3, m12
+    punpcklbw    m12, m14, m13
+    punpckhbw    m13, m14, m13
+
+    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
+    punpcklwd    m14, m15, m1
+    punpckhwd    m15, m1
+    punpcklwd     m1, m0, m2
+    punpckhwd     m0, m2
+    punpcklwd     m2, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m4, m6
+    punpckhwd     m4, m6
+    punpcklwd     m6, m7, m9
+    punpckhwd     m7, m9
+    punpcklwd     m9, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd    m10, m11, m12
+    punpckhwd    m11, m12
+    mova         m12, %3
+    mova          %3, m11
+    punpcklwd    m11, m12, m13
+    punpckhwd    m12, m13
+
+    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+    punpckldq    m13, m14, m2
+    punpckhdq    m14, m2
+    punpckldq     m2, m15, m3
+    punpckhdq    m15, m3
+    punpckldq     m3, m1, m5
+    punpckhdq     m1, m5
+    punpckldq     m5, m0, m4
+    punpckhdq     m0, m4
+    punpckldq     m4, m6, m10
+    punpckhdq     m6, m10
+    punpckldq    m10, m9, m11
+    punpckhdq     m9, m11
+    punpckldq    m11, m8, m12
+    punpckhdq     m8, m12
+    mova         m12, %3
+    mova          %3, m8
+    punpckldq     m8, m7, m12
+    punpckhdq     m7, m12
+
+    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
+    punpcklqdq   m12, m13, m4
+    punpckhqdq   m13, m4
+    punpcklqdq    m4, m14, m6
+    punpckhqdq   m14, m6
+    punpcklqdq    m6, m2, m8
+    punpckhqdq    m2, m8
+    punpcklqdq    m8, m15, m7
+    punpckhqdq   m15, m7
+    punpcklqdq    m7, m3, m10
+    punpckhqdq    m3, m10
+    punpcklqdq   m10, m1, m9
+    punpckhqdq    m1, m9
+    punpcklqdq    m9, m5, m11
+    punpckhqdq    m5, m11
+    mova         m11, %3
+    mova          %3, m12
+    punpcklqdq   m12, m0, m11
+    punpckhqdq    m0, m11
+%if %2 == 0
+    mova         m11, %3
+%endif
+
+    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
+    SWAP          0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
+    SWAP          3, 14, 12, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+    ; load data
+%ifidn %2, v
+%if %1 == 4
+    lea         tmpq, [dstq+mstrideq*2]
+    mova          m3, [tmpq+strideq*0]          ; p1
+    mova          m4, [tmpq+strideq*1]          ; p0
+    mova          m5, [tmpq+strideq*2]          ; q0
+    mova          m6, [tmpq+stride3q]           ; q1
+%else
+    ; load 6-8 pixels, remainder (for wd=16) will be read inline
+    lea         tmpq, [dstq+mstrideq*4]
+%if %1 != 6
+    mova         m12, [tmpq+strideq*0]
+%endif
+    mova         m13, [tmpq+strideq*1]
+    mova          m3, [tmpq+strideq*2]
+    mova          m4, [tmpq+stride3q]
+    mova          m5, [dstq+strideq*0]
+    mova          m6, [dstq+strideq*1]
+    mova         m14, [dstq+strideq*2]
+%if %1 != 6
+    mova         m15, [dstq+stride3q]
+%endif
+%endif
+%else
+    ; load lines
+%if %1 == 4
+    movd         xm3, [dstq+strideq*0-2]
+    movd         xm4, [dstq+strideq*1-2]
+    movd         xm5, [dstq+strideq*2-2]
+    movd         xm6, [dstq+stride3q -2]
+    lea         tmpq, [dstq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 2
+    pinsrd       xm4, [tmpq+strideq*1-2], 2
+    pinsrd       xm5, [tmpq+strideq*2-2], 2
+    pinsrd       xm6, [tmpq+stride3q -2], 2
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 1
+    pinsrd       xm4, [tmpq+strideq*1-2], 1
+    pinsrd       xm5, [tmpq+strideq*2-2], 1
+    pinsrd       xm6, [tmpq+stride3q -2], 1
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 3
+    pinsrd       xm4, [tmpq+strideq*1-2], 3
+    pinsrd       xm5, [tmpq+strideq*2-2], 3
+    pinsrd       xm6, [tmpq+stride3q -2], 3
+    lea         tmpq, [tmpq+strideq*4]
+    movd        xm12, [tmpq+strideq*0-2]
+    movd        xm13, [tmpq+strideq*1-2]
+    movd        xm14, [tmpq+strideq*2-2]
+    movd        xm15, [tmpq+stride3q -2]
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 2
+    pinsrd      xm13, [tmpq+strideq*1-2], 2
+    pinsrd      xm14, [tmpq+strideq*2-2], 2
+    pinsrd      xm15, [tmpq+stride3q -2], 2
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 1
+    pinsrd      xm13, [tmpq+strideq*1-2], 1
+    pinsrd      xm14, [tmpq+strideq*2-2], 1
+    pinsrd      xm15, [tmpq+stride3q -2], 1
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 3
+    pinsrd      xm13, [tmpq+strideq*1-2], 3
+    pinsrd      xm14, [tmpq+strideq*2-2], 3
+    pinsrd      xm15, [tmpq+stride3q -2], 3
+    vinserti128   m3, xm12, 1
+    vinserti128   m4, xm13, 1
+    vinserti128   m5, xm14, 1
+    vinserti128   m6, xm15, 1
+
+    ; transpose 4x16
+    ; xm3: A-D0,A-D8,A-D4,A-D12
+    ; xm4: A-D1,A-D9,A-D5,A-D13
+    ; xm5: A-D2,A-D10,A-D6,A-D14
+    ; xm6: A-D3,A-D11,A-D7,A-D15
+    punpcklbw     m7, m3, m4
+    punpckhbw     m3, m4
+    punpcklbw     m4, m5, m6
+    punpckhbw     m5, m6
+    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+    punpcklwd     m6, m7, m4
+    punpckhwd     m7, m4
+    punpcklwd     m4, m3, m5
+    punpckhwd     m3, m5
+    ; xm6: A0-3,B0-3,C0-3,D0-3
+    ; xm7: A8-11,B8-11,C8-11,D8-11
+    ; xm4: A4-7,B4-7,C4-7,D4-7
+    ; xm3: A12-15,B12-15,C12-15,D12-15
+    punpckldq     m5, m6, m4
+    punpckhdq     m6, m4
+    punpckldq     m4, m7, m3
+    punpckhdq     m7, m3
+    ; xm5: A0-7,B0-7
+    ; xm6: C0-7,D0-7
+    ; xm4: A8-15,B8-15
+    ; xm7: C8-15,D8-15
+    punpcklqdq    m3, m5, m4
+    punpckhqdq    m4, m5, m4
+    punpcklqdq    m5, m6, m7
+    punpckhqdq    m6, m7
+    ; xm3: A0-15
+    ; xm5: B0-15
+    ; xm4: C0-15
+    ; xm6: D0-15
+%elif %1 == 6 || %1 == 8
+    movq         xm3, [dstq+strideq*0-%1/2]
+    movq         xm4, [dstq+strideq*1-%1/2]
+    movq         xm5, [dstq+strideq*2-%1/2]
+    movq         xm6, [dstq+stride3q -%1/2]
+    lea         tmpq, [dstq+strideq*8]
+    movhps       xm3, [tmpq+strideq*0-%1/2]
+    movhps       xm4, [tmpq+strideq*1-%1/2]
+    movhps       xm5, [tmpq+strideq*2-%1/2]
+    movhps       xm6, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movq         xm7, [tmpq+strideq*0-%1/2]
+    movq         xm8, [tmpq+strideq*1-%1/2]
+    movq         xm9, [tmpq+strideq*2-%1/2]
+    movq        xm11, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps       xm7, [tmpq+strideq*0-%1/2]
+    movhps       xm8, [tmpq+strideq*1-%1/2]
+    movhps       xm9, [tmpq+strideq*2-%1/2]
+    movhps      xm11, [tmpq+stride3q -%1/2]
+    vinserti128   m3, xm7, 1
+    vinserti128   m4, xm8, 1
+    vinserti128   m5, xm9, 1
+    vinserti128   m6, xm11, 1
+    lea         tmpq, [dstq+strideq*4]
+    movq        xm12, [tmpq+strideq*0-%1/2]
+    movq        xm13, [tmpq+strideq*1-%1/2]
+    movq        xm14, [tmpq+strideq*2-%1/2]
+    movq        xm15, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps      xm12, [tmpq+strideq*0-%1/2]
+    movhps      xm13, [tmpq+strideq*1-%1/2]
+    movhps      xm14, [tmpq+strideq*2-%1/2]
+    movhps      xm15, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movq         xm7, [tmpq+strideq*0-%1/2]
+    movq         xm8, [tmpq+strideq*1-%1/2]
+    movq         xm9, [tmpq+strideq*2-%1/2]
+    movq        xm11, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps       xm7, [tmpq+strideq*0-%1/2]
+    movhps       xm8, [tmpq+strideq*1-%1/2]
+    movhps       xm9, [tmpq+strideq*2-%1/2]
+    movhps      xm11, [tmpq+stride3q -%1/2]
+    vinserti128  m12, xm7, 1
+    vinserti128  m13, xm8, 1
+    vinserti128  m14, xm9, 1
+    vinserti128  m15, xm11, 1
+
+    ; transpose 8x16
+    ; xm3: A-H0,A-H8
+    ; xm4: A-H1,A-H9
+    ; xm5: A-H2,A-H10
+    ; xm6: A-H3,A-H11
+    ; xm12: A-H4,A-H12
+    ; xm13: A-H5,A-H13
+    ; xm14: A-H6,A-H14
+    ; xm15: A-H7,A-H15
+    punpcklbw    m7, m3, m4
+    punpckhbw    m3, m4
+    punpcklbw    m4, m5, m6
+    punpckhbw    m5, m6
+    punpcklbw    m6, m12, m13
+    punpckhbw   m12, m13
+    punpcklbw   m13, m14, m15
+    punpckhbw   m14, m15
+    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+    punpcklwd   m15, m7, m4
+    punpckhwd    m7, m4
+    punpcklwd    m4, m3, m5
+    punpckhwd    m3, m5
+    punpcklwd    m5, m6, m13
+    punpckhwd    m6, m13
+    punpcklwd   m13, m12, m14
+    punpckhwd   m12, m14
+    ; xm15: A0-3,B0-3,C0-3,D0-3
+    ; xm7: E0-3,F0-3,G0-3,H0-3
+    ; xm4: A8-11,B8-11,C8-11,D8-11
+    ; xm3: E8-11,F8-11,G8-11,H8-11
+    ; xm5: A4-7,B4-7,C4-7,D4-7
+    ; xm6: E4-7,F4-7,G4-7,H4-7
+    ; xm13: A12-15,B12-15,C12-15,D12-15
+    ; xm12: E12-15,F12-15,G12-15,H12-15
+    punpckldq   m14, m15, m5
+    punpckhdq   m15, m5
+    punpckldq    m5, m7, m6
+%if %1 != 6
+    punpckhdq    m7, m6
+%endif
+    punpckldq    m6, m4, m13
+    punpckhdq    m4, m13
+    punpckldq   m13, m3, m12
+%if %1 != 6
+    punpckhdq   m12, m3, m12
+%endif
+    ; xm14: A0-7,B0-7
+    ; xm15: C0-7,D0-7
+    ; xm5: E0-7,F0-7
+    ; xm7: G0-7,H0-7
+    ; xm6: A8-15,B8-15
+    ; xm4: C8-15,D8-15
+    ; xm13: E8-15,F8-15
+    ; xm12: G8-15,H8-15
+    punpcklqdq   m3, m14, m6
+    punpckhqdq  m14, m6
+    punpckhqdq   m6, m15, m4
+    punpcklqdq  m15, m4
+    punpcklqdq   m4, m5, m13
+    punpckhqdq  m13, m5, m13
+%if %1 == 8
+    punpcklqdq   m5, m7, m12
+    punpckhqdq  m12, m7, m12
+    ; xm3: A0-15
+    ; xm14: B0-15
+    ; xm15: C0-15
+    ; xm6: D0-15
+    ; xm4: E0-15
+    ; xm13: F0-15
+    ; xm5: G0-15
+    ; xm12: H0-15
+    SWAP         12, 3, 15
+    SWAP         13, 14, 5, 4, 6
+    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+%else
+    SWAP         13, 3, 14
+    SWAP          6, 4, 15, 5
+    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+%endif
+%else
+    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+    ; remainder at the end for the second transpose
+    movu         xm0, [dstq+strideq*0-8]
+    movu         xm1, [dstq+strideq*1-8]
+    movu         xm2, [dstq+strideq*2-8]
+    movu         xm3, [dstq+stride3q -8]
+    lea         tmpq, [dstq+strideq*4]
+    movu         xm4, [tmpq+strideq*0-8]
+    movu         xm5, [tmpq+strideq*1-8]
+    movu         xm6, [tmpq+strideq*2-8]
+    movu         xm7, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu         xm8, [tmpq+strideq*0-8]
+    movu         xm9, [tmpq+strideq*1-8]
+    movu        xm10, [tmpq+strideq*2-8]
+    movu        xm11, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu        xm12, [tmpq+strideq*0-8]
+    movu        xm13, [tmpq+strideq*1-8]
+    movu        xm14, [tmpq+strideq*2-8]
+    movu        xm15, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m0, [tmpq+strideq*0-8], 1
+    vinserti128   m1, [tmpq+strideq*1-8], 1
+    vinserti128   m2, [tmpq+strideq*2-8], 1
+    vinserti128   m3, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m4, [tmpq+strideq*0-8], 1
+    vinserti128   m5, [tmpq+strideq*1-8], 1
+    vinserti128   m6, [tmpq+strideq*2-8], 1
+    vinserti128   m7, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m8, [tmpq+strideq*0-8], 1
+    vinserti128   m9, [tmpq+strideq*1-8], 1
+    vinserti128  m10, [tmpq+strideq*2-8], 1
+    vinserti128  m11, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128  m12, [tmpq+strideq*0-8], 1
+    vinserti128  m13, [tmpq+strideq*1-8], 1
+    vinserti128  m14, [tmpq+strideq*2-8], 1
+    vinserti128  m15, [tmpq+stride3q -8], 1
+
+    TRANSPOSE_16X16B 0, 1, [rsp+11*32]
+    mova  [rsp+12*32], m1
+    mova  [rsp+13*32], m2
+    mova  [rsp+14*32], m3
+    mova  [rsp+15*32], m12
+    mova  [rsp+16*32], m13
+    mova  [rsp+17*32], m14
+    mova  [rsp+18*32], m15
+    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+    SWAP           12, 4, 7
+    SWAP           13, 5, 8
+    SWAP            3, 6, 9
+    SWAP           10, 14
+    SWAP           11, 15
+%endif
+%endif
+
+    ; load L/E/I/H
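+    ; derive the strength parameters from the per-block filter level L:
+    ; I = clamp((L >> sharp0) & 63, 1, sharp1), H = L >> 4, E = 2*L+4+I
+    ; (lutq+128/136 presumably hold the two sharpness-derived limits).
+    ; the pb_128 xors bias unsigned bytes into signed range so that
+    ; pcmpgtb can stand in for unsigned compares.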
+%ifidn %2, v
+    movu          m1, [lq]
+    movu          m0, [lq+l_strideq]
+%else
+    movq         xm1, [lq]
+    movq         xm2, [lq+l_strideq*2]
+    movhps       xm1, [lq+l_strideq]
+    movhps       xm2, [lq+l_stride3q]
+    lea           lq, [lq+l_strideq*4]
+    movq        xm10, [lq]
+    movq         xm0, [lq+l_strideq*2]
+    movhps      xm10, [lq+l_strideq]
+    movhps       xm0, [lq+l_stride3q]
+    lea           lq, [lq+l_strideq*4]
+    vinserti128   m1, xm10, 1
+    vinserti128   m2, xm0, 1
+    shufps        m0, m1, m2, q3131
+    shufps        m1, m2, q2020
+%endif
+    pxor          m2, m2
+    pcmpeqb      m10, m2, m0
+    pand          m1, m10
+    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
+    pshufb        m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
+    pcmpeqb      m10, m2, m0                    ; !L
+    psrlq         m2, m0, [lutq+128]
+    pand          m2, [pb_63]
+    vpbroadcastb  m1, [lutq+136]
+    pminub        m2, m1
+    pmaxub        m2, [pb_1]                    ; I
+    pand          m1, m0, [pb_240]
+    psrlq         m1, 4                         ; H
+    paddb         m0, [pb_2]
+    paddb         m0, m0
+    paddb         m0, m2                        ; E
+    pxor          m1, [pb_128]
+    pxor          m2, [pb_128]
+    pxor          m0, [pb_128]
+
+    ABSSUB        m8, m3, m4, m9                ; abs(p1-p0)
+    pmaxub        m8, m10
+    ABSSUB        m9, m5, m6, m10               ; abs(q1-q0)
+    pmaxub        m8, m9
+%if %1 == 4
+    pxor          m8, [pb_128]
+    pcmpgtb       m7, m8, m1                    ; hev
+%else
+    pxor          m7, m8, [pb_128]
+    pcmpgtb       m7, m1                        ; hev
+
+%if %1 == 6
+    ABSSUB        m9, m13, m4, m10              ; abs(p2-p0)
+    pmaxub        m9, m8
+%else
+    ABSSUB        m9, m12, m4, m10              ; abs(p3-p0)
+    pmaxub        m9, m8
+    ABSSUB       m10, m13, m4, m11              ; abs(p2-p0)
+    pmaxub        m9, m10
+%endif
+    ABSSUB       m10, m5,  m14, m11             ; abs(q2-q0)
+    pmaxub        m9, m10
+%if %1 != 6
+    ABSSUB       m10, m5,  m15, m11             ; abs(q3-q0)
+    pmaxub        m9, m10
+%endif
+    pxor          m9, [pb_128]
+    pcmpgtb       m9, [pb_129]                  ; !flat8in
+
+%if %1 == 6
+    ABSSUB       m10, m13, m3,  m1              ; abs(p2-p1)
+%else
+    ABSSUB       m10, m12, m13, m11             ; abs(p3-p2)
+    ABSSUB       m11, m13, m3,  m1              ; abs(p2-p1)
+    pmaxub       m10, m11
+    ABSSUB       m11, m14, m15, m1              ; abs(q3-q2)
+    pmaxub       m10, m11
+%endif
+    ABSSUB       m11, m14, m6,  m1              ; abs(q2-q1)
+    pmaxub       m10, m11
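+    ; maskq holds three dword bitmasks (one bit per 4-pixel unit) for the
+    ; wd>=4/8/16 filters; anding with pb_mask and pcmpeqd expands each
+    ; unit's bit into an all-ones dword lane mask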
+%if %1 == 16
+    vpbroadcastd m11, [maskq+8]
+    vpbroadcastd  m1, [maskq+4]
+    por          m11, m1
+    pand         m11, [pb_mask]
+    pcmpeqd      m11, [pb_mask]
+    pand         m10, m11
+%else
+    vpbroadcastd m11, [maskq+4]
+    pand         m11, [pb_mask]
+    pcmpeqd      m11, [pb_mask]
+    pand         m10, m11                       ; only apply fm-wide to wd>4 blocks
+%endif
+    pmaxub        m8, m10
+
+    pxor          m8, [pb_128]
+%endif
+    pcmpgtb       m8, m2
+
+    ABSSUB       m10, m3, m6, m11               ; abs(p1-q1)
+    ABSSUB       m11, m4, m5, m2                ; abs(p0-q0)
+    paddusb      m11, m11
+    pand         m10, [pb_254]
+    psrlq        m10, 1
+    paddusb      m10, m11                       ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+    pxor         m10, [pb_128]
+    pcmpgtb      m10, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+    por           m8, m10
+
+%if %1 == 16
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*8]
+    mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+12*32]
+%endif
+    ABSSUB        m1, m0, m4, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+13*32]
+%endif
+    ABSSUB        m2, m0, m4, m10
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+stride3q]
+%else
+    mova          m0, [rsp+14*32]
+%endif
+    ABSSUB        m2, m0, m4, m10
+    pmaxub        m1, m2
+%ifidn %2, v
+    lea         tmpq, [dstq+strideq*4]
+    mova          m0, [tmpq+strideq*0]
+%else
+    mova          m0, [rsp+15*32]
+%endif
+    ABSSUB        m2, m0, m5, m10
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+16*32]
+%endif
+    ABSSUB        m2, m0, m5, m10
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+17*32]
+%endif
+    ABSSUB        m2, m0, m5, m10
+    pmaxub        m1, m2
+    pxor          m1, [pb_128]
+    pcmpgtb       m1, [pb_129]                  ; !flat8out
+    por           m1, m9                        ; !flat8in | !flat8out
+    vpbroadcastd  m2, [maskq+8]
+    pand         m10, m2, [pb_mask]
+    pcmpeqd      m10, [pb_mask]
+    pandn         m1, m10                       ; flat16
+    pandn         m1, m8, m1                    ; flat16 & fm
+
+    vpbroadcastd m10, [maskq+4]
+    por          m10, m2
+    pand          m2, m10, [pb_mask]
+    pcmpeqd       m2, [pb_mask]
+    pandn         m9, m2                        ; flat8in
+    pandn         m9, m8, m9
+    vpbroadcastd  m2, [maskq+0]
+    por           m2, m10
+    pand          m2, [pb_mask]
+    pcmpeqd       m2, [pb_mask]
+    pandn         m8, m2
+    pandn         m8, m9, m8                    ; fm & !flat8 & !flat16
+    pandn         m9, m1, m9                    ; flat8 & !flat16
+%elif %1 != 4
+    vpbroadcastd  m0, [maskq+4]
+    pand          m2, m0, [pb_mask]
+    pcmpeqd       m2, [pb_mask]
+    pandn         m9, m2
+    pandn         m9, m8, m9                    ; flat8 & fm
+    vpbroadcastd  m2, [maskq+0]
+    por           m0, m2
+    pand          m0, [pb_mask]
+    pcmpeqd       m0, [pb_mask]
+    pandn         m8, m0
+    pandn         m8, m9, m8                    ; fm & !flat8
+%else
+    vpbroadcastd  m0, [maskq+0]
+    pand          m0, [pb_mask]
+    pcmpeqd       m0, [pb_mask]
+    pandn         m8, m0                        ; fm
+%endif
+
+    ; short filter
+
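+    ; f  = iclip_diff(p1-q1) & hev, then f = iclip_diff(3*(q0-p0)+f) & fm;
+    ; f1 = (f+4)>>3, f2 = (f+3)>>3, p0 += f2, q0 -= f1. the signed >>3 is
+    ; emulated bytewise: mask with 0xf8, psrlq 3, then xor/sub 16 to
+    ; sign-extend from 5 bits. for !hev lanes, p1 += (f1+1)>>1 and
+    ; q1 -= it (the pavgb-with-zero below). all of this runs in the
+    ; 128-biased domain set up by the pb_128 xors.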
+    pxor          m3, [pb_128]
+    pxor          m6, [pb_128]
+    psubsb       m10, m3, m6                    ; iclip_diff(p1-q1)
+    pand         m10, m7                        ; f=iclip_diff(p1-q1)&hev
+    pxor          m4, [pb_128]
+    pxor          m5, [pb_128]
+    psubsb       m11, m5, m4
+    paddsb       m10, m11
+    paddsb       m10, m11
+    paddsb       m10, m11                       ; f=iclip_diff(3*(q0-p0)+f)
+    pand          m8, m10                       ; f&=fm
+    paddsb       m10, m8, [pb_3]
+    paddsb        m8, [pb_4]
+    pand         m10, [pb_248]
+    pand          m8, [pb_248]
+    psrlq        m10, 3
+    psrlq         m8, 3
+    pxor         m10, [pb_16]
+    pxor          m8, [pb_16]
+    psubb        m10, [pb_16]                   ; f2
+    psubb         m8, [pb_16]                   ; f1
+    paddsb        m4, m10
+    psubsb        m5, m8
+    pxor          m4, [pb_128]
+    pxor          m5, [pb_128]
+
+    pxor          m8, [pb_128]
+    pxor         m10, m10
+    pavgb         m8, m10                       ; f=(f1+1)>>1
+    psubb         m8, [pb_64]
+    pandn         m8, m7, m8                    ; f&=!hev
+    paddsb        m3, m8
+    psubsb        m6, m8
+    pxor          m3, [pb_128]
+    pxor          m6, [pb_128]
+
+%if %1 == 16
+    ; flat16 filter
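+    ; sliding 16-weight sum: seed with p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+    ; (weights sum to 16), then each output slides the window by
+    ; subtracting/adding one tap pair via pmaddubsw with the pb_m1_1-style
+    ; constants; pmulhrsw with pw_2048 is exactly (sum+8)>>4. m1
+    ; (flat16 & fm) selects filtered vs. original bytes per lane.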
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*8]
+    mova          m0, [tmpq+strideq*1]          ; p6
+    mova          m2, [tmpq+strideq*2]          ; p5
+    mova          m7, [tmpq+stride3q]           ; p4
+%else
+    mova          m0, [rsp+12*32]
+    mova          m2, [rsp+13*32]
+    mova          m7, [rsp+14*32]
+%endif
+
+    mova  [rsp+0*32], m9
+    mova  [rsp+1*32], m14
+    mova  [rsp+2*32], m15
+
+    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+    ; write -6
+    punpcklbw    m14, m0, m12
+    punpckhbw    m15, m0, m12
+    pmaddubsw    m10, m14, [pb_7_1]
+    pmaddubsw    m11, m15, [pb_7_1]             ; p6*7+p3
+    punpcklbw     m8, m2, m7
+    punpckhbw     m9, m2, m7
+    pmaddubsw     m8, [pb_2]
+    pmaddubsw     m9, [pb_2]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3
+    punpcklbw     m8, m13, m3
+    punpckhbw     m9, m13, m3
+    pmaddubsw     m8, [pb_1]
+    pmaddubsw     m9, [pb_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1
+    punpcklbw     m8, m4, m5
+    punpckhbw     m9, m4, m5
+    pmaddubsw     m8, [pb_1]
+    pmaddubsw     m9, [pb_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m2
+    por           m8, m9
+%ifidn %2, v
+    mova [tmpq+strideq*2], m8                   ; p5
+%else
+    mova [rsp+13*32], m8
+%endif
+
+    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+    ; write -5
+    pmaddubsw    m14, [pb_m1_1]
+    pmaddubsw    m15, [pb_m1_1]
+    paddw        m10, m14
+    paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+    punpcklbw     m8, m0, m6
+    punpckhbw     m9, m0, m6
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    mova  [rsp+3*32], m8
+    mova  [rsp+4*32], m9
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m7
+    por           m8, m9
+%ifidn %2, v
+    mova [tmpq+stride3q], m8                    ; p4
+%else
+    mova [rsp+14*32], m8
+%endif
+
+    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+    ; write -4
+    mova         m14, [rsp+1*32]
+    punpcklbw     m8, m0, m13
+    punpckhbw     m9, m0, m13
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+    punpcklbw     m8, m2, m14
+    punpckhbw     m2, m14
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m2, [pb_m1_1]
+    mova  [rsp+1*32], m8
+    paddw        m10, m8
+    paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m12
+    por           m8, m9
+%ifidn %2, v
+    mova [tmpq+strideq*4], m8                   ; p3
+%else
+    mova [rsp+19*32], m8
+%endif
+
+    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+    ; write -3
+    mova         m15, [rsp+2*32]
+    punpcklbw     m8, m0, m3
+    punpckhbw     m9, m0, m3
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+    punpcklbw     m8, m7, m15
+    punpckhbw     m7, m15
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m7, [pb_m1_1]
+    mova  [rsp+2*32], m8
+    paddw        m10, m8
+    paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m13
+    por           m8, m9
+    mova  [rsp+6*32], m8                        ; don't clobber p2/m13 since we need it in F
+
+    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+    ; write -2
+%ifidn %2, v
+    lea         tmpq, [dstq+strideq*4]
+%endif
+    punpcklbw     m8, m0, m4
+    punpckhbw     m9, m0, m4
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%ifidn %2, v
+    mova          m9, [tmpq+strideq*0]          ; q4
+%else
+    mova          m9, [rsp+15*32]
+%endif
+    punpcklbw     m8, m12, m9
+    punpckhbw     m9, m12, m9
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    mova  [rsp+7*32], m8
+    mova  [rsp+5*32], m9
+    paddw        m10, m8
+    paddw        m11, m9                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m3
+    por           m8, m9
+    mova  [rsp+8*32], m8                        ; don't clobber p1/m3 since we need it in G
+
+    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+    ; write -1
+%ifidn %2, v
+    mova          m9, [tmpq+strideq*1]          ; q5
+%else
+    mova          m9, [rsp+16*32]
+%endif
+    punpcklbw     m8, m0, m5
+    punpckhbw     m0, m5
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m0, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m0                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+    punpcklbw     m0, m13, m9
+    punpckhbw     m9, m13, m9
+    mova         m13, [rsp+6*32]
+    pmaddubsw     m0, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    mova [rsp+ 9*32], m0
+    mova [rsp+10*32], m9
+    paddw        m10, m0
+    paddw        m11, m9                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+    pmulhrsw      m0, m10, [pw_2048]
+    pmulhrsw      m8, m11, [pw_2048]
+    packuswb      m0, m8
+    pand          m0, m1
+    pandn         m8, m1, m4
+    por           m0, m8
+    mova  [rsp+6*32], m0                        ; don't clobber p0/m4 since we need it in H
+
+    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+    ; write +0
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*2]          ; q6
+%else
+    mova          m0, [rsp+17*32]
+%endif
+    paddw        m10, [rsp+3*32]
+    paddw        m11, [rsp+4*32]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+    punpcklbw     m8, m3, m0
+    punpckhbw     m9, m3, m0
+    mova          m3, [rsp+8*32]
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    mova  [rsp+3*32], m8
+    mova  [rsp+4*32], m9
+    paddw        m10, m8
+    paddw        m11, m9                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m5
+    por           m8, m9
+    mova  [rsp+8*32], m8                        ; don't clobber q0/m5 since we need it in I
+
+    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+    ; write +1
+    paddw        m10, [rsp+1*32]
+    paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+    punpcklbw     m8, m4, m0
+    punpckhbw     m2, m4, m0
+    mova          m4, [rsp+6*32]
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m2, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+    pmulhrsw      m2, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m2, m9
+    pand          m2, m1
+    pandn         m9, m1, m6
+    por           m2, m9                        ; don't clobber q1/m6 since we need it in J
+
+    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+    ; write +2
+    paddw        m10, [rsp+2*32]
+    paddw        m11, m7                        ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+    punpcklbw     m8, m5, m0
+    punpckhbw     m9, m5, m0
+    mova          m5, [rsp+8*32]
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+    pmulhrsw      m7, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m7, m9
+    pand          m7, m1
+    pandn         m9, m1, m14
+    por           m7, m9                        ; don't clobber q2/m14 since we need it in K
+
+    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+    ; write +3
+    paddw        m10, [rsp+7*32]
+    paddw        m11, [rsp+5*32]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+    punpcklbw     m8, m6, m0
+    punpckhbw     m9, m6, m0
+    SWAP           2, 6
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+    pandn         m9, m1, m15
+    por           m8, m9
+%ifidn %2, v
+    mova [tmpq+mstrideq], m8                    ; q3
+%else
+    mova [rsp+20*32], m8
+%endif
+
+    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+    ; write +4
+    paddw        m10, [rsp+ 9*32]
+    paddw        m11, [rsp+10*32]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+    punpcklbw     m8, m14, m0
+    punpckhbw     m9, m14, m0
+    SWAP          14, 7
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw      m8, m10, [pw_2048]
+    pmulhrsw      m9, m11, [pw_2048]
+    packuswb      m8, m9
+    pand          m8, m1
+%ifidn %2, v
+    pandn         m9, m1, [tmpq+strideq*0]
+%else
+    pandn         m9, m1, [rsp+15*32]
+%endif
+    por           m8, m9
+%ifidn %2, v
+    mova [tmpq+strideq*0], m8                    ; q4
+%else
+    mova [rsp+15*32], m8
+%endif
+
+    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+    ; write +5
+    paddw        m10, [rsp+3*32]
+    paddw        m11, [rsp+4*32]                ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+    punpcklbw     m8, m15, m0
+    punpckhbw     m9, m15, m0
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw     m9, [pb_m1_1]
+    paddw        m10, m8
+    paddw        m11, m9                        ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+    pmulhrsw     m10, [pw_2048]
+    pmulhrsw     m11, [pw_2048]
+    packuswb     m10, m11
+    pand         m10, m1
+%ifidn %2, v
+    pandn        m11, m1, [tmpq+strideq*1]
+%else
+    pandn        m11, m1, [rsp+16*32]
+%endif
+    por          m10, m11
+%ifidn %2, v
+    mova [tmpq+strideq*1], m10                  ; q5
+%else
+    mova [rsp+16*32], m10
+%endif
+
+    mova          m9, [rsp+0*32]
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*4]
+%endif
+%endif
+%if %1 >= 8
+    ; flat8 filter
+    punpcklbw     m0, m12, m3
+    punpckhbw     m1, m12, m3
+    pmaddubsw     m2, m0, [pb_3_1]
+    pmaddubsw     m7, m1, [pb_3_1]              ; 3 * p3 + p1
+    punpcklbw     m8, m13, m4
+    punpckhbw    m11, m13, m4
+    pmaddubsw     m8, [pb_2_1]
+    pmaddubsw    m11, [pb_2_1]
+    paddw         m2, m8
+    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0
+    punpcklbw     m8, m5, [pb_4]
+    punpckhbw    m11, m5, [pb_4]
+    pmaddubsw     m8, [pb_1]
+    pmaddubsw    m11, [pb_1]
+    paddw         m2, m8
+    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+    psrlw         m8, m2, 3
+    psrlw        m11, m7, 3
+    packuswb      m8, m11
+    pand          m8, m9
+    pandn        m11, m9, m13
+    por          m10, m8, m11                  ; p2
+%ifidn %2, v
+    mova [tmpq+strideq*1], m10                 ; p2
+%endif
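+
+    ; the flat8 filter reuses the running-sum idea with 8-weight windows;
+    ; in scalar form the p2 output above is (illustrative only):
+    ;   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3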
+
+    pmaddubsw     m8, m0, [pb_m1_1]
+    pmaddubsw    m11, m1, [pb_m1_1]
+    paddw         m2, m8
+    paddw         m7, m11
+    punpcklbw     m8, m13, m6
+    punpckhbw    m11, m13, m6
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw    m11, [pb_m1_1]
+    paddw         m2, m8
+    paddw         m7, m11                       ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+    psrlw         m8, m2, 3
+    psrlw        m11, m7, 3
+    packuswb      m8, m11
+    pand          m8, m9
+    pandn        m11, m9, m3
+    por           m8, m11                       ; p1
+%ifidn %2, v
+    mova [tmpq+strideq*2], m8                   ; p1
+%else
+    mova  [rsp+0*32], m8
+%endif
+
+    pmaddubsw     m0, [pb_1]
+    pmaddubsw     m1, [pb_1]
+    psubw         m2, m0
+    psubw         m7, m1
+    punpcklbw     m8, m4, m14
+    punpckhbw    m11, m4, m14
+    pmaddubsw     m8, [pb_1]
+    pmaddubsw    m11, [pb_1]
+    paddw         m2, m8
+    paddw         m7, m11                       ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+    psrlw         m8, m2, 3
+    psrlw        m11, m7, 3
+    packuswb      m8, m11
+    pand          m8, m9
+    pandn        m11, m9, m4
+    por           m8, m11                       ; p0
+%ifidn %2, v
+    mova [tmpq+stride3q ], m8                   ; p0
+%else
+    mova  [rsp+1*32], m8
+%endif
+
+    punpcklbw     m0, m5, m15
+    punpckhbw     m1, m5, m15
+    pmaddubsw     m8, m0, [pb_1]
+    pmaddubsw    m11, m1, [pb_1]
+    paddw         m2, m8
+    paddw         m7, m11
+    punpcklbw     m8, m4, m12
+    punpckhbw    m11, m4, m12
+    pmaddubsw     m8, [pb_1]
+    pmaddubsw    m11, [pb_1]
+    psubw         m2, m8
+    psubw         m7, m11                       ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+    psrlw         m8, m2, 3
+    psrlw        m11, m7, 3
+    packuswb      m8, m11
+    pand          m8, m9
+    pandn        m11, m9, m5
+    por          m11, m8, m11                   ; q0
+%ifidn %2, v
+    mova [dstq+strideq*0], m11                  ; q0
+%endif
+
+    pmaddubsw     m0, [pb_m1_1]
+    pmaddubsw     m1, [pb_m1_1]
+    paddw         m2, m0
+    paddw         m7, m1
+    punpcklbw     m8, m13, m6
+    punpckhbw    m13, m6
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw    m13, [pb_m1_1]
+    paddw         m2, m8
+    paddw         m7, m13                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+    psrlw         m8, m2, 3
+    psrlw        m13, m7, 3
+    packuswb      m8, m13
+    pand          m8, m9
+    pandn        m13, m9, m6
+    por          m13, m8, m13                   ; q1
+%ifidn %2, v
+    mova [dstq+strideq*1], m13                  ; q1
+%endif
+
+    punpcklbw     m0, m3, m6
+    punpckhbw     m1, m3, m6
+    pmaddubsw     m0, [pb_1]
+    pmaddubsw     m1, [pb_1]
+    psubw         m2, m0
+    psubw         m7, m1
+    punpcklbw     m0, m14, m15
+    punpckhbw     m1, m14, m15
+    pmaddubsw     m0, [pb_1]
+    pmaddubsw     m1, [pb_1]
+    paddw         m2, m0
+    paddw         m7, m1                        ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+    psrlw         m2, 3
+    psrlw         m7, 3
+    packuswb      m2, m7
+    pand          m2, m9
+    pandn         m7, m9, m14
+    por           m2, m7                        ; q2
+%ifidn %2, v
+    mova [dstq+strideq*2], m2                   ; q2
+%else
+    mova          m0, [rsp+0*32]
+    mova          m1, [rsp+1*32]
+%if %1 == 8
+    ; 16x8 transpose
+    punpcklbw     m3, m12, m10
+    punpckhbw    m12, m10
+    punpcklbw    m10, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m11, m13
+    punpckhbw    m11, m13
+    punpcklbw    m13, m2, m15
+    punpckhbw     m2, m15
+
+    punpcklwd    m15, m3, m10
+    punpckhwd     m3, m10
+    punpcklwd    m10, m12, m0
+    punpckhwd    m12, m0
+    punpcklwd     m0, m1, m13
+    punpckhwd     m1, m13
+    punpcklwd    m13, m11, m2
+    punpckhwd    m11, m2
+
+    punpckldq     m2, m15, m0
+    punpckhdq    m15, m0
+    punpckldq     m0, m3, m1
+    punpckhdq     m3, m1
+    punpckldq     m1, m10, m13
+    punpckhdq    m10, m13
+    punpckldq    m13, m12, m11
+    punpckhdq    m12, m11
+
+    ; write 8x32
+    movq   [dstq+strideq*0-4], xm2
+    movhps [dstq+strideq*1-4], xm2
+    movq   [dstq+strideq*2-4], xm15
+    movhps [dstq+stride3q -4], xm15
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm0
+    movhps [dstq+strideq*1-4], xm0
+    movq   [dstq+strideq*2-4], xm3
+    movhps [dstq+stride3q -4], xm3
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm1
+    movhps [dstq+strideq*1-4], xm1
+    movq   [dstq+strideq*2-4], xm10
+    movhps [dstq+stride3q -4], xm10
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm13
+    movhps [dstq+strideq*1-4], xm13
+    movq   [dstq+strideq*2-4], xm12
+    movhps [dstq+stride3q -4], xm12
+    lea         dstq, [dstq+strideq*4]
+
+    vextracti128  xm2,  m2, 1
+    vextracti128 xm15, m15, 1
+    vextracti128  xm0,  m0, 1
+    vextracti128  xm3,  m3, 1
+    vextracti128  xm1,  m1, 1
+    vextracti128 xm10, m10, 1
+    vextracti128 xm13, m13, 1
+    vextracti128 xm12, m12, 1
+
+    movq   [dstq+strideq*0-4], xm2
+    movhps [dstq+strideq*1-4], xm2
+    movq   [dstq+strideq*2-4], xm15
+    movhps [dstq+stride3q -4], xm15
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm0
+    movhps [dstq+strideq*1-4], xm0
+    movq   [dstq+strideq*2-4], xm3
+    movhps [dstq+stride3q -4], xm3
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm1
+    movhps [dstq+strideq*1-4], xm1
+    movq   [dstq+strideq*2-4], xm10
+    movhps [dstq+stride3q -4], xm10
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm13
+    movhps [dstq+strideq*1-4], xm13
+    movq   [dstq+strideq*2-4], xm12
+    movhps [dstq+stride3q -4], xm12
+    lea         dstq, [dstq+strideq*4]
+%else
+    ; 16x16 transpose and store
+    SWAP           5, 10, 2
+    SWAP           6, 0
+    SWAP           7, 1
+    SWAP           8, 11
+    SWAP           9, 13
+    mova          m0, [rsp+11*32]
+    mova          m1, [rsp+12*32]
+    mova          m2, [rsp+13*32]
+    mova          m3, [rsp+14*32]
+    mova          m4, [rsp+19*32]
+    mova         m11, [rsp+20*32]
+    mova         m12, [rsp+15*32]
+    mova         m13, [rsp+16*32]
+    mova         m14, [rsp+17*32]
+    TRANSPOSE_16X16B 1, 0, [rsp+18*32]
+    movu [dstq+strideq*0-8], xm0
+    movu [dstq+strideq*1-8], xm1
+    movu [dstq+strideq*2-8], xm2
+    movu [dstq+stride3q -8], xm3
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm4
+    movu [dstq+strideq*1-8], xm5
+    movu [dstq+strideq*2-8], xm6
+    movu [dstq+stride3q -8], xm7
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm8
+    movu [dstq+strideq*1-8], xm9
+    movu [dstq+strideq*2-8], xm10
+    movu [dstq+stride3q -8], xm11
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm12
+    movu [dstq+strideq*1-8], xm13
+    movu [dstq+strideq*2-8], xm14
+    movu [dstq+stride3q -8], xm15
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m0, 1
+    vextracti128 [dstq+strideq*1-8], m1, 1
+    vextracti128 [dstq+strideq*2-8], m2, 1
+    vextracti128 [dstq+stride3q -8], m3, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m4, 1
+    vextracti128 [dstq+strideq*1-8], m5, 1
+    vextracti128 [dstq+strideq*2-8], m6, 1
+    vextracti128 [dstq+stride3q -8], m7, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m8, 1
+    vextracti128 [dstq+strideq*1-8], m9, 1
+    vextracti128 [dstq+strideq*2-8], m10, 1
+    vextracti128 [dstq+stride3q -8], m11, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m12, 1
+    vextracti128 [dstq+strideq*1-8], m13, 1
+    vextracti128 [dstq+strideq*2-8], m14, 1
+    vextracti128 [dstq+stride3q -8], m15, 1
+    lea         dstq, [dstq+strideq*4]
+%endif
+%endif
+%elif %1 == 6
+    ; flat6 filter
+
+    punpcklbw     m8, m13, m5
+    punpckhbw    m11, m13, m5
+    pmaddubsw     m0, m8, [pb_3_1]
+    pmaddubsw     m1, m11, [pb_3_1]
+    punpcklbw     m7, m4, m3
+    punpckhbw    m10, m4, m3
+    pmaddubsw     m2, m7, [pb_2]
+    pmaddubsw    m12, m10, [pb_2]
+    paddw         m0, m2
+    paddw         m1, m12
+    pmulhrsw      m2, m0, [pw_4096]
+    pmulhrsw     m12, m1, [pw_4096]
+    packuswb      m2, m12
+    pand          m2, m9
+    pandn        m12, m9, m3
+    por           m2, m12
+%ifidn %2, v
+    mova [tmpq+strideq*2], m2                   ; p1
+%endif
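+
+    ; flat6 narrows the window to 6 taps; in scalar form the p1 output
+    ; above is (illustrative only; pmulhrsw with pw_4096 is (x + 4) >> 3):
+    ;   p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3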
+
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw    m11, [pb_m1_1]
+    paddw         m0, m8
+    paddw         m1, m11
+    punpcklbw     m8, m13, m6
+    punpckhbw    m11, m13, m6
+    pmaddubsw     m8, [pb_m1_1]
+    pmaddubsw    m11, [pb_m1_1]
+    paddw         m0, m8
+    paddw         m1, m11
+    pmulhrsw     m12, m0, [pw_4096]
+    pmulhrsw     m13, m1, [pw_4096]
+    packuswb     m12, m13
+    pand         m12, m9
+    pandn        m13, m9, m4
+    por          m12, m13
+%ifidn %2, v
+    mova [tmpq+stride3q], m12                   ; p0
+%endif
+
+    paddw         m0, m8
+    paddw         m1, m11
+    punpcklbw     m8, m3, m14
+    punpckhbw    m11, m3, m14
+    pmaddubsw    m14, m8, [pb_m1_1]
+    pmaddubsw    m13, m11, [pb_m1_1]
+    paddw         m0, m14
+    paddw         m1, m13
+    pmulhrsw     m14, m0, [pw_4096]
+    pmulhrsw     m13, m1, [pw_4096]
+    packuswb     m14, m13
+    pand         m14, m9
+    pandn        m13, m9, m5
+    por          m14, m13
+%ifidn %2, v
+    mova [dstq+strideq*0], m14                  ; q0
+%endif
+
+    pmaddubsw     m8, [pb_m1_2]
+    pmaddubsw    m11, [pb_m1_2]
+    paddw         m0, m8
+    paddw         m1, m11
+    pmaddubsw     m7, [pb_m1_0]
+    pmaddubsw    m10, [pb_m1_0]
+    paddw         m0, m7
+    paddw         m1, m10
+    pmulhrsw      m0, [pw_4096]
+    pmulhrsw      m1, [pw_4096]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m9, m6
+    por           m0, m9
+%ifidn %2, v
+    mova [dstq+strideq*1], m0                   ; q1
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else
+%ifidn %2, v
+    mova [tmpq+strideq*0], m3                   ; p1
+    mova [tmpq+strideq*1], m4                   ; p0
+    mova [tmpq+strideq*2], m5                   ; q0
+    mova [tmpq+stride3q ], m6                   ; q1
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    w, stride3, mstride, tmp
+    shl    l_strideq, 2
+    sub           lq, l_strideq
+    mov     mstrideq, strideq
+    neg     mstrideq
+    lea     stride3q, [strideq*3]
+
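+    ; vmask[0..2] carry one bit per 4-pixel unit for edges needing the
+    ; narrow, flat8 and flat16 filters; each byte tested below covers the
+    ; 32 pixels handled per iteration and is only a fast path, since the
+    ; FILTER macro still selects per unit via mask0-mask2
+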
+.loop:
+    cmp byte [maskq+8], 0                       ; vmask[2]
+    je .no_flat16
+
+    FILTER        16, v
+    jmp .end
+
+.no_flat16:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         8, v
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .end
+
+    FILTER         4, v
+
+.end:
+    add           lq, 32
+    add         dstq, 32
+    add        maskq, 1
+    sub           wd, 8
+    jg .loop
+    RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    h, stride3, l_stride3, tmp
+    shl    l_strideq, 2
+    sub           lq, 4
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+
+.loop:
+    cmp byte [maskq+8], 0                       ; vmask[2]
+    je .no_flat16
+
+    FILTER        16, h
+    jmp .end
+
+.no_flat16:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         8, h
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .no_filter
+
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+stride3q*8]
+    lea           lq, [lq+l_strideq*8]
+    lea         dstq, [dstq+strideq*8]
+.end:
+    add        maskq, 1
+    sub           hd, 8
+    jg .loop
+    RET
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv, 7, 10, 16, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     w, stride3, mstride, tmp
+    shl    l_strideq, 2
+    sub           lq, l_strideq
+    mov     mstrideq, strideq
+    neg     mstrideq
+    lea     stride3q, [strideq*3]
+
+.loop:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         6, v
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .end
+
+    FILTER         4, v
+
+.end:
+    add           lq, 32
+    add         dstq, 32
+    add        maskq, 1
+    sub           wd, 8
+    jg .loop
+    RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv, 7, 10, 16, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     h, stride3, l_stride3, tmp
+    shl    l_strideq, 2
+    sub           lq, 4
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+
+.loop:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         6, h
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .no_filter
+
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+stride3q*8]
+    lea           lq, [lq+l_strideq*8]
+    lea         dstq, [dstq+strideq*8]
+.end:
+    add        maskq, 1
+    sub           hd, 8
+    jg .loop
+    RET
+
+%endif ; ARCH_X86_64
diff --git a/src/x86/loopfilter_init_tmpl.c b/src/x86/loopfilter_init_tmpl.c
new file mode 100644 (file)
index 0000000..4d48c90
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
+decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
+decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
+decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+
+COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
+    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
+    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
+    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
+#endif
+
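+    /* Pointers are assigned in increasing ISA order, so a faster
+     * implementation simply overwrites the slower one installed above
+     * when the corresponding CPU flag is present. */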
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2;
+    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2;
+    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2;
+    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2;
+#endif
+}
diff --git a/src/x86/loopfilter_ssse3.asm b/src/x86/loopfilter_ssse3.asm
new file mode 100644 (file)
index 0000000..cc70051
--- /dev/null
@@ -0,0 +1,2348 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 8 db 7, 1
+pb_3_1: times 8 db 3, 1
+pb_2_1: times 8 db 2, 1
+pb_m1_0: times 8 db -1, 0
+pb_m1_1: times 8 db -1, 1
+pb_m1_2: times 8 db -1, 2
+pb_1: times 16 db 1
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_16: times 16 db 16
+pb_63: times 16 db 63
+pb_64: times 16 db 64
+pb_128: times 16 db 0x80
+pb_129: times 16 db 0x81
+pb_240: times 16 db 0xf0
+pb_248: times 16 db 0xf8
+pb_254: times 16 db 0xfe
+
+pw_2048: times 8 dw 2048
+pw_4096: times 8 dw 4096
+
+pd_mask: dd 1, 2, 4, 8
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+    psubusb       %1, %2, %3
+    psubusb       %4, %3, %2
+    por           %1, %4
+%endmacro
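+
+; ABSSUB computes an unsigned per-byte absolute difference without a
+; dedicated instruction: one of the two saturating subtractions clamps
+; to zero, so their OR yields |a - b|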
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
+    ; transpose 16x4
+    punpcklbw    m%5, m%1, m%2
+    punpckhbw    m%1, m%2
+    punpcklbw    m%2, m%3, m%4
+    punpckhbw    m%3, m%4
+    punpcklwd    m%4, m%5, m%2
+    punpckhwd    m%5, m%2
+    punpcklwd    m%2, m%1, m%3
+    punpckhwd    m%1, m%3
+
+    ; write out
+%assign %%n 0
+%rep 4
+    movd [dstq+strideq *0-2], xm%4
+    movd [dstq+strideq *4-2], xm%5
+    movd [dstq+strideq *8-2], xm%2
+    movd [dstq+stride3q*4-2], xm%1
+    add         dstq, strideq
+%if %%n < 3
+    psrldq      xm%4, 4
+    psrldq      xm%5, 4
+    psrldq      xm%2, 4
+    psrldq      xm%1, 4
+%endif
+%assign %%n (%%n+1)
+%endrep
+    lea         dstq, [dstq+stride3q*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
+%if %1 == 0
+    mova          %2, m15 ; m7 in 32-bit
+%endif
+
+    ; input in m0-7
+    punpcklbw    m15, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m2, m3
+    punpckhbw     m2, m3
+    punpcklbw     m3, m4, m5
+    punpckhbw     m4, m5
+%if ARCH_X86_64
+    SWAP           4, 5, 7
+%else
+ %if %1 == 0
+    mova          m5, %2
+ %else
+    mova          m5, [esp+1*16]
+ %endif
+    mova          %2, m4
+%endif
+    punpcklbw     m4, m6, m5
+    punpckhbw     m6, m5
+
+    ; interleaved in m15,0,1,2,3,7,4,6
+    punpcklwd     m5, m15, m1
+    punpckhwd    m15, m1
+    punpcklwd     m1, m0, m2
+    punpckhwd     m0, m2
+    punpcklwd     m2, m3, m4
+    punpckhwd     m3, m4
+%if ARCH_X86_64
+    SWAP           3, 4, 7
+%else
+    mova          m4, %2
+    mova          %2, m3
+%endif
+    punpcklwd     m3, m4, m6
+    punpckhwd     m4, m6
+
+    ; interleaved in m5,15,1,0,2,7,3,4
+    punpckldq     m6, m5, m2
+    punpckhdq     m5, m2
+%if ARCH_X86_64
+    SWAP           2, 7, 5
+%else
+    mova          m2, %2
+    mova  [esp+1*16], m5
+%endif
+    punpckldq     m5, m15, m2
+    punpckhdq    m15, m2
+    punpckldq     m2, m1, m3
+    punpckhdq     m1, m3
+    punpckldq     m3, m0, m4
+    punpckhdq     m0, m4
+
+%if ARCH_X86_32
+    mova  [esp+0*16], m6
+    mova  [esp+2*16], m5
+    mova  [esp+3*16], m15
+    mova  [esp+4*16], m2
+    mova  [esp+5*16], m1
+    mova  [esp+6*16], m3
+    mova  [esp+7*16], m0
+    mova          m8, [esp+ 8*16]
+    mova          m9, [esp+ 9*16]
+    mova         m10, [esp+10*16]
+ %if %1 == 0
+    mova         m11, [esp+11*16]
+    mova         m12, [esp+12*16]
+    mova         m13, [esp+13*16]
+    mova         m14, [esp+14*16]
+ %else
+    mova         m11, [esp+20*16]
+    mova         m12, [esp+15*16]
+    mova         m13, [esp+16*16]
+    mova         m14, [esp+17*16]
+ %endif
+%endif
+
+    ; input in m8-m15
+%if ARCH_X86_64
+    SWAP           7, 4
+%endif
+    punpcklbw     m7, m8, m9
+    punpckhbw     m8, m9
+    punpcklbw     m9, m10, m11
+    punpckhbw    m10, m11
+    punpcklbw    m11, m12, m13
+    punpckhbw    m12, m13
+%if ARCH_X86_64
+    mova         m13, %2
+%else
+ %if %1 == 0
+    mova         m13, [esp+15*16]
+ %else
+    mova         m13, [esp+18*16]
+ %endif
+%endif
+    mova          %2, m12
+    punpcklbw    m12, m14, m13
+    punpckhbw    m14, m14, m13
+
+    ; interleaved in m7,8,9,10,11,rsp%2,12,14
+    punpcklwd    m13, m7, m9
+    punpckhwd     m7, m9
+    punpcklwd     m9, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd    m10, m11, m12
+    punpckhwd    m11, m12
+    mova         m12, %2
+    mova          %2, m11
+    punpcklwd    m11, m12, m14
+    punpckhwd    m12, m14
+
+    ; interleaved in m13,7,9,8,10,rsp%2,11,12
+    punpckldq    m14, m13, m10
+    punpckhdq    m13, m10
+    punpckldq    m10, m9, m11
+    punpckhdq     m9, m11
+    punpckldq    m11, m8, m12
+    punpckhdq     m8, m12
+    mova         m12, %2
+    mova          %2, m8
+    punpckldq     m8, m7, m12
+    punpckhdq     m7, m12
+
+%if ARCH_X86_32
+    mova [esp+ 8*16], m10
+    mova [esp+ 9*16], m9
+    mova [esp+10*16], m11
+    SWAP           6, 1
+    SWAP           4, 2
+    SWAP           5, 3
+    mova          m6, [esp+0*16]
+    mova          m4, [esp+1*16]
+    mova          m5, [esp+2*16]
+%endif
+
+    ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
+    punpcklqdq   m12, m6, m14
+    punpckhqdq    m6, m14
+    punpcklqdq   m14, m4, m13
+    punpckhqdq    m4, m13
+    punpcklqdq   m13, m5, m8
+    punpckhqdq    m5, m8
+%if ARCH_X86_64
+    SWAP           8, 5
+%else
+    mova          m8, [esp+3*16]
+    mova [esp+27*16], m5
+ %define m15 m8
+%endif
+    punpcklqdq    m5, m15, m7
+    punpckhqdq   m15, m7
+
+%if ARCH_X86_32
+    mova [esp+11*16], m12
+    mova [esp+12*16], m6
+    mova [esp+13*16], m14
+    mova [esp+14*16], m4
+    mova [esp+26*16], m13
+    mova [esp+ 0*16], m5
+    mova [esp+ 1*16], m15
+    mova          m2, [esp+ 4*16]
+    mova         m10, [esp+ 8*16]
+    mova          m1, [esp+ 5*16]
+    mova          m9, [esp+ 9*16]
+    mova          m3, [esp+ 6*16]
+    mova         m11, [esp+10*16]
+    mova          m0, [esp+ 7*16]
+%endif
+
+    punpcklqdq    m7, m2, m10
+    punpckhqdq    m2, m10
+    punpcklqdq   m10, m1, m9
+    punpckhqdq    m1, m9
+    punpcklqdq    m9, m3, m11
+    punpckhqdq    m3, m11
+    mova         m11, %2
+%if ARCH_X86_32
+ %define m12 m3
+%endif
+    mova          %2, m12
+    punpcklqdq   m12, m0, m11
+    punpckhqdq    m0, m11
+%if %1 == 1
+    mova         m11, %2
+%endif
+
+%if ARCH_X86_64
+    ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
+    SWAP           0, 11, 1, 6, 5, 8, 7, 15
+    SWAP           2, 14, 12, 9
+    SWAP           3, 4, 13
+%else
+ %if %1 == 0
+    mova [esp+15*16], m9
+    mova [esp+17*16], m12
+    mova [esp+18*16], m0
+    mova [esp+28*16], m10
+    mova [esp+29*16], m1
+    mova          m3, [esp+0*16]
+    mova          m4, [esp+1*16]
+    SWAP          m5, m7
+    SWAP          m6, m2
+ %else
+    SWAP           0, 7
+    SWAP           3, 1, 2, 4, 6
+ %endif
+%endif
+%endmacro
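+
+; the transpose works in four interleave passes (bytes, words, dwords,
+; qwords); each pass doubles the span of consecutive source bytes, so
+; after the four passes rows and columns are exchanged. The x86-32 path
+; spills through %2/esp because only 8 xmm registers are available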
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%if ARCH_X86_64
+ %define %%flat8mem [rsp+0*16]
+ %define %%q2mem    [rsp+1*16]
+ %define %%q3mem    [rsp+2*16]
+%else
+ %if %1 == 4 || %1 == 6
+  %define %%p2mem      [esp+ 8*16]
+  %define %%q2mem      [esp+ 9*16]
+  %define %%flat8mem   [esp+10*16]
+ %else
+  %ifidn %2, v
+   %define %%p2mem      [esp+16*16]
+   %define %%q2mem      [esp+ 1*16]
+   %define %%q3mem      [esp+18*16]
+   %define %%flat8mem   [esp+ 0*16]
+   %define %%flat16mem  [esp+20*16]
+  %else
+   %define %%p2mem     [esp+27*16]
+   %define %%q2mem     [esp+28*16]
+   %define %%q3mem     [esp+29*16]
+   %define %%flat8mem  [esp+21*16]
+   %define %%flat16mem [esp+30*16]
+  %endif
+ %endif
+ %xdefine m12reg m12
+%endif
+
+%if ARCH_X86_32
+    lea     stride3q, [strideq*3]
+%endif
+    ; load data
+%ifidn %2, v
+%if ARCH_X86_32
+    mov     mstrideq, strideq
+    neg     mstrideq
+%endif
+%if %1 == 4
+    lea         tmpq, [dstq+mstrideq*2]
+    mova          m3, [tmpq+strideq*0]          ; p1
+    mova          m4, [tmpq+strideq*1]          ; p0
+    mova          m5, [tmpq+strideq*2]          ; q0
+    mova          m6, [tmpq+stride3q]           ; q1
+%else
+    ; load 6-8 pixels, remainder (for wd=16) will be read inline
+    lea         tmpq, [dstq+mstrideq*4]
+    ; we load p3 later
+%define %%p3mem [dstq+mstrideq*4]
+ %if ARCH_X86_32
+  %define m13 m0
+  %define m14 m1
+  %define m15 m2
+ %endif
+    mova         m13, [tmpq+strideq*1]
+    mova          m3, [tmpq+strideq*2]
+    mova          m4, [tmpq+stride3q]
+    mova          m5, [dstq+strideq*0]
+    mova          m6, [dstq+strideq*1]
+    mova         m14, [dstq+strideq*2]
+%if %1 != 6
+    mova         m15, [dstq+stride3q]
+%endif
+ %if ARCH_X86_32
+    mova     %%p2mem, m13
+    mova     %%q2mem, m14
+  %define m13 %%p2mem
+  %define m14 %%q2mem
+  %if %1 != 6
+    mova     %%q3mem, m15
+   %define m15 %%q3mem
+  %endif
+ %endif
+%endif
+%else ; %2 == h
+    ; load lines
+%if %1 == 4
+    ; transpose 4x16
+    movd          m7, [dstq+strideq*0-2]
+    movd          m3, [dstq+strideq*1-2]
+    movd          m4, [dstq+strideq*2-2]
+    movd          m5, [dstq+stride3q -2]
+    lea         tmpq, [dstq+strideq*4]
+    punpcklbw     m7, m3
+    punpcklbw     m4, m5
+    movd          m3, [tmpq+strideq*0-2]
+    movd          m1, [tmpq+strideq*1-2]
+    movd          m5, [tmpq+strideq*2-2]
+    movd          m6, [tmpq+stride3q -2]
+    lea         tmpq, [tmpq+strideq*4]
+    punpcklbw     m3, m1
+    punpcklbw     m5, m6
+    movd          m0, [tmpq+strideq*0-2]
+    movd          m1, [tmpq+strideq*1-2]
+    punpcklbw     m0, m1
+    movd          m1, [tmpq+strideq*2-2]
+    movd          m2, [tmpq+stride3q -2]
+    punpcklbw     m1, m2
+    punpcklqdq    m7, m0
+    punpcklqdq    m4, m1
+    lea         tmpq, [tmpq+strideq*4]
+    movd          m0, [tmpq+strideq*0-2]
+    movd          m1, [tmpq+strideq*1-2]
+    punpcklbw     m0, m1
+    movd          m1, [tmpq+strideq*2-2]
+    movd          m2, [tmpq+stride3q -2]
+    punpcklbw     m1, m2
+    punpcklqdq    m3, m0
+    punpcklqdq    m5, m1
+    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+    punpcklwd     m6, m7, m4
+    punpckhwd     m7, m4
+    punpcklwd     m4, m3, m5
+    punpckhwd     m3, m5
+    ; xm6: A0-3,B0-3,C0-3,D0-3
+    ; xm7: A8-11,B8-11,C8-11,D8-11
+    ; xm4: A4-7,B4-7,C4-7,D4-7
+    ; xm3: A12-15,B12-15,C12-15,D12-15
+    punpckldq     m5, m6, m4
+    punpckhdq     m6, m4
+    punpckldq     m4, m7, m3
+    punpckhdq     m7, m3
+    ; xm5: A0-7,B0-7
+    ; xm6: C0-7,D0-7
+    ; xm4: A8-15,B8-15
+    ; xm7: C8-15,D8-15
+    punpcklqdq    m3, m5, m4
+    punpckhqdq    m5, m5, m4
+    punpcklqdq    m4, m6, m7
+    punpckhqdq    m6, m7
+    ; xm3: A0-15
+    ; xm5: B0-15
+    ; xm4: C0-15
+    ; xm6: D0-15
+    SWAP           4, 5
+%elif %1 == 6 || %1 == 8
+    ; transpose 8x16
+    movq          m7, [dstq+strideq*0-%1/2]
+    movq          m3, [dstq+strideq*1-%1/2]
+    movq          m4, [dstq+strideq*2-%1/2]
+    movq          m5, [dstq+stride3q -%1/2]
+    lea         tmpq, [dstq+strideq*8]
+    punpcklbw     m7, m3
+    punpcklbw     m4, m5
+    movq          m3, [tmpq+strideq*0-%1/2]
+    movq          m1, [tmpq+strideq*1-%1/2]
+    movq          m5, [tmpq+strideq*2-%1/2]
+    movq          m6, [tmpq+stride3q -%1/2]
+    lea         tmpq, [dstq+strideq*4]
+    punpcklbw     m3, m1
+    punpcklbw     m5, m6
+    movq          m6, [tmpq+strideq*0-%1/2]
+    movq          m0, [tmpq+strideq*1-%1/2]
+    movq          m1, [tmpq+strideq*2-%1/2]
+    movq          m2, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    punpcklbw     m6, m0
+    punpcklbw     m1, m2
+    movq          m2, [tmpq+strideq*2-%1/2]
+    movq          m0, [tmpq+stride3q -%1/2]
+    punpcklbw     m2, m0
+%if ARCH_X86_64
+    SWAP         m15, m2
+%else
+ %define m15 [esp+3*16]
+    mova         m15, m2
+%endif
+    movq          m0, [tmpq+strideq*0-%1/2]
+    movq          m2, [tmpq+strideq*1-%1/2]
+    punpcklbw     m0, m2
+    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+    ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+    ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+    ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+    punpcklwd     m2, m7, m4
+    punpckhwd     m7, m4
+    punpcklwd     m4, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m6, m1
+    punpckhwd     m6, m1
+    punpcklwd     m1, m0, m15
+    punpckhwd     m0, m15
+%if ARCH_X86_64
+    SWAP         m15, m0
+%else
+    mova         m15, m0
+%endif
+    ; xm2: A0-3,B0-3,C0-3,D0-3
+    ; xm7: E0-3,F0-3,G0-3,H0-3
+    ; xm4: A8-11,B8-11,C8-11,D8-11
+    ; xm3: E8-11,F8-11,G8-11,H8-11
+    ; xm5: A4-7,B4-7,C4-7,D4-7
+    ; xm6: E4-7,F4-7,G4-7,H4-7
+    ; xm1: A12-15,B12-15,C12-15,D12-15
+    ; xm0: E12-15,F12-15,G12-15,H12-15
+    punpckldq     m0, m2, m5
+    punpckhdq     m2, m5
+    punpckldq     m5, m7, m6
+%if %1 != 6
+    punpckhdq     m7, m6
+%endif
+    punpckldq     m6, m4, m1
+    punpckhdq     m4, m1
+    punpckldq     m1, m3, m15
+%if %1 != 6
+    punpckhdq     m3, m15
+ %if ARCH_X86_64
+    SWAP         m15, m3
+ %else
+    mova         m15, m3
+ %endif
+%endif
+    ; xm0: A0-7,B0-7
+    ; xm2: C0-7,D0-7
+    ; xm5: E0-7,F0-7
+    ; xm7: G0-7,H0-7
+    ; xm6: A8-15,B8-15
+    ; xm4: C8-15,D8-15
+    ; xm1: E8-15,F8-15
+    ; xm3: G8-15,H8-15
+    punpcklqdq    m3, m0, m6
+    punpckhqdq    m0, m6
+    punpckhqdq    m6, m2, m4
+    punpcklqdq    m2, m4
+    punpcklqdq    m4, m5, m1
+    punpckhqdq    m5, m1
+%if %1 == 8
+    punpcklqdq    m1, m7, m15
+    punpckhqdq    m7, m15
+    ; xm3: A0-15
+    ; xm0: B0-15
+    ; xm2: C0-15
+    ; xm6: D0-15
+    ; xm4: E0-15
+    ; xm5: F0-15
+    ; xm1: G0-15
+    ; xm7: H0-15
+%if ARCH_X86_64
+    SWAP          11, 3, 2
+    SWAP          13, 0
+    SWAP           6, 5, 4
+    SWAP          14, 1
+    SWAP          15, 7
+    ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
+    mova [rsp+21*16], m11
+ %define %%p3mem [rsp+21*16]
+%else
+ %define m11 [esp+26*16]
+ %define m13 [esp+27*16]
+ %define m14 [esp+28*16]
+ %define m15 [esp+29*16]
+    mova         m11, m3
+    mova         m13, m0
+    SWAP           3, 2
+    SWAP           6, 5, 4
+    mova         m14, m1
+    mova         m15, m7
+ %define %%p3mem [esp+26*16]
+%endif
+%else
+ %if ARCH_X86_64
+    SWAP          13, 3, 0
+    SWAP          14, 5, 6, 4, 2
+    ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
+ %else
+  %define m13 %%p2mem
+  %define m14 %%q2mem
+    mova         m13, m3
+    mova         m14, m5
+    SWAP           3, 0
+    SWAP           5, 6, 4, 2
+    ; 0,2,6,4 -> 3,4,5,6
+ %endif
+%endif
+%else
+%if ARCH_X86_64
+    mova [rsp+20*16], m12
+%endif
+    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+    ; remainder at the end for the second transpose
+%if ARCH_X86_32
+ %xdefine m8  m0
+ %xdefine m9  m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+    lea         tmpq, [dstq+strideq*8]
+    movu          m8, [tmpq+strideq*0-8]
+    movu          m9, [tmpq+strideq*1-8]
+    movu         m10, [tmpq+strideq*2-8]
+    movu         m11, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu         m12, [tmpq+strideq*0-8]
+    movu         m13, [tmpq+strideq*1-8]
+    movu         m14, [tmpq+strideq*2-8]
+    movu         m15, [tmpq+stride3q -8]
+    mova [esp+ 8*16], m8
+    mova [esp+ 9*16], m9
+    mova [esp+10*16], m10
+    mova [esp+11*16], m11
+    mova [esp+12*16], m12
+    mova [esp+13*16], m13
+    mova [esp+14*16], m14
+    mova [esp+15*16], m15
+%endif
+    movu          m0, [dstq+strideq*0-8]
+    movu          m1, [dstq+strideq*1-8]
+    movu          m2, [dstq+strideq*2-8]
+    movu          m3, [dstq+stride3q -8]
+    lea         tmpq, [dstq+strideq*4]
+    movu          m4, [tmpq+strideq*0-8]
+    movu          m5, [tmpq+strideq*1-8]
+    movu          m6, [tmpq+strideq*2-8]
+    movu          m7, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+%if ARCH_X86_64
+    movu          m8, [tmpq+strideq*0-8]
+    movu          m9, [tmpq+strideq*1-8]
+    movu         m10, [tmpq+strideq*2-8]
+    movu         m11, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu         m12, [tmpq+strideq*0-8]
+    movu         m13, [tmpq+strideq*1-8]
+    movu         m14, [tmpq+strideq*2-8]
+    movu         m15, [tmpq+stride3q -8]
+%endif
+
+%if ARCH_X86_64
+    TRANSPOSE_16X16B 0, [rsp+11*16]
+    mova [rsp+12*16], m1
+    mova [rsp+13*16], m2
+    mova [rsp+14*16], m3
+    mova [rsp+15*16], m12
+    mova [rsp+16*16], m13
+    mova [rsp+17*16], m14
+    mova [rsp+18*16], m15
+    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+    SWAP          12, 4, 7
+    SWAP          13, 5, 8
+    SWAP           3, 6, 9
+    SWAP          10, 14
+    SWAP          11, 15
+    mova [rsp+21*16], m12
+ %define %%p3mem [rsp+21*16]
+    mova         m12, [rsp+20*16]
+%else
+    TRANSPOSE_16X16B 0, [esp+16*16]
+ %define %%p3mem [esp+26*16]
+ %define m11 %%p3mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+%endif ; if 4 elif 6 or 8 else 16
+%endif ; if v else h
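+
+    ; at this point the h variants have transposed pixel columns into the
+    ; same row layout the v variants load directly, so a single filter
+    ; core below serves both directions; h results are transposed back on
+    ; store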
+
+    ; load L/E/I/H
+%if ARCH_X86_32
+    mov    l_strideq, l_stridem
+%endif
+%ifidn %2, v
+    movu          m1, [lq]
+    movu          m0, [lq+l_strideq]
+%else
+ %if ARCH_X86_32
+    lea   l_stride3q, [l_strideq*3]
+ %endif
+    movq         xm1, [lq]
+    movq         xm2, [lq+l_strideq*2]
+    movhps       xm1, [lq+l_strideq]
+    movhps       xm2, [lq+l_stride3q]
+    shufps        m0, m1, m2, q3131
+    shufps        m1, m2, q2020
+ %if ARCH_X86_32
+    lea     stride3q, [strideq*3]
+ %endif
+%endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+    mov         lutd, lutm
+ %endif
+%endif
+    pxor          m2, m2
+    pcmpeqb       m7, m2, m0
+    pand          m1, m7
+    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
+    pshufb        m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+    pcmpeqb       m2, m0                        ; !L
+    psrlq         m7, m0, [lutq+128]
+    pand          m7, [PIC_sym(pb_63)]
+    pminub        m7, minlvl
+    pmaxub        m7, [PIC_sym(pb_1)]           ; I
+    pand          m1, m0, [PIC_sym(pb_240)]
+    psrlq         m1, 4                         ; H
+    paddb         m0, [PIC_sym(pb_2)]
+    paddb         m0, m0
+    paddb         m0, m7                        ; E
+    pxor          m1, [PIC_sym(pb_128)]
+    pxor          m7, [PIC_sym(pb_128)]
+    pxor          m0, [PIC_sym(pb_128)]
+    SWAP           2, 7
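+
+    ; roughly, with L the per-edge filter level from the l[] plane
+    ; (scalar sketch, field names illustrative):
+    ;   I = clip(L >> sharpness, 1, limit);  H = L >> 4;  E = 2*L + 4 + I
+    ; the pb_128 xors bias values so signed pcmpgtb can stand in for the
+    ; unsigned byte compares x86 lacks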
+
+%if ARCH_X86_64
+    SWAP           0, 8
+    SWAP           2, 10
+%else
+ %ifidn %2, v
+    mov     mstrideq, strideq
+    neg     mstrideq
+  %if %1 == 4
+    lea         tmpq, [dstq+mstrideq*2]
+  %elif %1 == 6 || %1 == 8
+    lea         tmpq, [dstq+mstrideq*4]
+  %endif
+ %endif
+    mova  [esp+3*16], m0
+    mova  [esp+4*16], m2
+%endif
+
+    ABSSUB        m0, m3, m4, m2                ; abs(p1-p0)
+    pmaxub        m0, m7
+    ABSSUB        m2, m5, m6, m7                ; abs(q1-q0)
+    pmaxub        m0, m2
+%if %1 == 4
+    pxor          m0, [PIC_sym(pb_128)]
+    pcmpgtb       m7, m0, m1                    ; hev
+ %if ARCH_X86_64
+    SWAP           7, 11
+ %else
+    mova  [esp+5*16], m7
+ %endif
+%else
+    pxor          m7, m0, [PIC_sym(pb_128)]
+    pcmpgtb       m7, m1                        ; hev
+%if ARCH_X86_64
+    SWAP           7, 11
+%else
+    mova  [esp+5*16], m7
+%endif
+
+%if %1 == 6
+    ABSSUB        m1, m13, m4, m7               ; abs(p2-p0)
+    pmaxub        m1, m0
+%else
+    mova          m2, %%p3mem
+    ABSSUB        m1, m2, m4, m7                ; abs(p3-p0)
+    pmaxub        m1, m0
+    ABSSUB        m7, m13, m4, m2               ; abs(p2-p0)
+    pmaxub        m1, m7
+%endif
+    ABSSUB        m7, m5, m14, m2               ; abs(q2-q0)
+    pmaxub        m1, m7
+%if %1 != 6
+    ABSSUB        m7, m5, m15, m2               ; abs(q3-q0)
+    pmaxub        m1, m7
+%endif
+    pxor          m1, [PIC_sym(pb_128)]
+    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8in
+%if ARCH_X86_64
+    SWAP           1, 9
+%else
+    mova  [esp+6*16], m1
+%endif
+
+%if %1 == 6
+    ABSSUB        m7, m13, m3, m1               ; abs(p2-p1)
+%else
+    mova          m2, %%p3mem
+    ABSSUB        m7, m2, m13, m1               ; abs(p3-p2)
+    ABSSUB        m2, m13, m3, m1               ; abs(p2-p1)
+    pmaxub        m7, m2
+    ABSSUB        m2, m14, m15, m1              ; abs(q3-q2)
+    pmaxub        m7, m2
+%endif
+    ABSSUB        m2, m14, m6,  m1              ; abs(q2-q1)
+    pmaxub        m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+    mova         m12, maskmem
+%endif
+    pand          m2, m12, mask1
+    pcmpeqd       m2, m12
+    pand          m7, m2                        ; only apply fm-wide to wd>4 blocks
+    pmaxub        m0, m7
+
+    pxor          m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+    SWAP           2, 10
+    pcmpgtb       m0, m2
+%else
+    pcmpgtb       m0, [esp+4*16]
+%endif
+
+    ABSSUB        m1, m3, m6, m7                ; abs(p1-q1)
+    ABSSUB        m7, m4, m5, m2                ; abs(p0-q0)
+    paddusb       m7, m7
+    pand          m1, [PIC_sym(pb_254)]
+    psrlq         m1, 1
+    paddusb       m1, m7                        ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+    pxor          m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+    pcmpgtb       m1, m8                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+    pcmpgtb       m1, [esp+3*16]
+%endif
+    por           m0, m1
+
+%if %1 == 16
+%if ARCH_X86_64
+    SWAP           0, 8
+%else
+    mova  [esp+3*16], m0
+%endif
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*8]
+    mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+12*16]
+%endif
+    ABSSUB        m1, m0, m4, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+13*16]
+%endif
+    ABSSUB        m2, m0, m4, m7
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+stride3q]
+%else
+    mova          m0, [rsp+14*16]
+%endif
+    ABSSUB        m2, m0, m4, m7
+    pmaxub        m1, m2
+%ifidn %2, v
+    lea         tmpq, [dstq+strideq*4]
+    mova          m0, [tmpq+strideq*0]
+%else
+    mova          m0, [rsp+15*16]
+%endif
+    ABSSUB        m2, m0, m5, m7
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+16*16]
+%endif
+    ABSSUB        m2, m0, m5, m7
+    pmaxub        m1, m2
+%ifidn %2, v
+    mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+17*16]
+%endif
+    ABSSUB        m2, m0, m5, m7
+    pmaxub        m1, m2
+    pxor          m1, [PIC_sym(pb_128)]
+    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8out
+%if ARCH_X86_64
+    por           m1, m9                        ; !flat8in | !flat8out
+%else
+    por           m1, [esp+6*16]
+ %define m12 m7
+    mova         m12, maskmem
+%endif
+    pand          m2, m12, mask2
+    pcmpeqd       m2, m12
+    pandn         m1, m2                        ; flat16
+%if ARCH_X86_64
+    pandn         m2, m8, m1                    ; flat16 & fm
+%else
+    pandn         m2, [esp+3*16], m1            ; flat16 & fm
+    mova %%flat16mem, m2
+%endif
+    SWAP           1, 2
+
+    pand          m2, m12, mask1
+    pcmpeqd       m2, m12
+%if ARCH_X86_64
+    pandn         m9, m2                    ; flat8in
+    pandn         m2, m8, m9
+    SWAP           2, 9
+%else
+    pandn         m0, [esp+6*16], m2
+    pandn         m2, [esp+3*16], m0
+    mova  [esp+6*16], m2
+%endif
+    pand          m2, m12, mask0
+    pcmpeqd       m2, m12
+%if ARCH_X86_64
+    pandn         m8, m2
+    pandn         m2, m9, m8                    ; fm & !flat8 & !flat16
+    SWAP           2, 8
+    pandn         m2, m1, m9                    ; flat8 & !flat16
+    SWAP           2, 9
+    SWAP           0, 8
+    SWAP           1, 10
+%else
+    pandn         m0, [esp+3*16], m2
+    pandn         m2, [esp+6*16], m0
+    SWAP           2, 0
+    pandn         m2, m1, [esp+6*16]
+    mova  %%flat8mem, m2
+%endif
+%elif %1 != 4
+ %if ARCH_X86_64
+    SWAP           1, 9
+ %else
+  %define m12 m7
+    mova         m12, maskmem
+    mova          m1, [esp+6*16]
+ %endif
+    pand          m2, m12, mask1
+    pcmpeqd       m2, m12
+    pandn         m1, m2
+    pandn         m2, m0, m1                    ; flat8 & fm
+    pand          m1, m12, mask0
+    pcmpeqd       m1, m12
+    pandn         m0, m1
+    pandn         m1, m2, m0                    ; fm & !flat8
+    SWAP           1, 2, 0
+ %if ARCH_X86_64
+    SWAP           1, 9
+ %else
+    mova  %%flat8mem, m1
+ %endif
+%else
+%if ARCH_X86_32
+ %define m12 m1
+    mova         m12, maskmem
+%endif
+    pand          m2, m12, mask0
+    pcmpeqd       m2, m12
+    pandn         m0, m2                        ; fm
+%endif
+
+    ; short filter
+
+    mova          m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+    SWAP           7, 11
+%else
+    mova          m7, [esp+5*16]
+%endif
+    pxor          m3, m1
+    pxor          m6, m1
+    pxor          m4, m1
+    pxor          m5, m1
+    psubsb        m1, m3, m6                    ; iclip_diff(p1-q1)
+    pand          m1, m7                        ; f=iclip_diff(p1-q1)&hev
+    psubsb        m2, m5, m4
+    paddsb        m1, m2
+    paddsb        m1, m2
+    paddsb        m1, m2                        ; f=iclip_diff(3*(q0-p0)+f)
+    mova          m2, [PIC_sym(pb_16)]
+    pand          m0, m1                        ; f&=fm
+    paddsb        m1, m0, [PIC_sym(pb_3)]
+    paddsb        m0, [PIC_sym(pb_4)]
+    pand          m1, [PIC_sym(pb_248)]
+    pand          m0, [PIC_sym(pb_248)]
+    psrlq         m1, 3
+    psrlq         m0, 3
+    pxor          m1, m2
+    pxor          m0, m2
+    psubb         m1, m2                        ; f2
+    psubb         m0, m2                        ; f1
+    mova          m2, [PIC_sym(pb_128)]
+    paddsb        m4, m1
+    psubsb        m5, m0
+    pxor          m4, m2
+    pxor          m5, m2
+
+    pxor          m0, m2
+    pxor          m1, m1
+    pavgb         m0, m1                        ; f=(f1+1)>>1
+    psubb         m0, [PIC_sym(pb_64)]
+    pandn         m7, m0                        ; f&=!hev
+    paddsb        m3, m7
+    psubsb        m6, m7
+    pxor          m3, m2
+    pxor          m6, m2
+
+%if %1 == 16
+    ; flat16 filter
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*8]
+    mova          m0, [tmpq+strideq*1]          ; p6
+    mova          m2, [tmpq+strideq*2]          ; p5
+    mova          m7, [tmpq+stride3q]           ; p4
+%else
+    mova          m0, [rsp+12*16]
+    mova          m2, [rsp+13*16]
+    mova          m7, [rsp+14*16]
+%endif
+
+%if ARCH_X86_64
+    SWAP           1, 10
+    mova  %%flat8mem, m9
+    mova     %%q2mem, m14
+    mova     %%q3mem, m15
+    SWAP           0, 8
+    SWAP           1, 9
+%else
+ %ifidn %2, v
+    mova [esp+17*16], m0
+    mova [esp+19*16], m3
+    mova [esp+21*16], m4
+    mova [esp+22*16], m5
+    mova [esp+23*16], m6
+  %xdefine m11 m3
+  %xdefine m14 m4
+  %xdefine m15 m5
+  %xdefine m10 m6
+  %define m13 %%p2mem
+  %define m8  [esp+17*16]
+  %define m9  %%flat16mem
+  %define m3  [esp+19*16]
+  %define m4  [esp+21*16]
+  %define m5  [esp+22*16]
+  %define m6  [esp+23*16]
+ %else
+    mova [esp+31*16], m0
+    mova [esp+32*16], m3
+    mova [esp+33*16], m4
+    mova [esp+34*16], m5
+    mova [esp+35*16], m6
+  %xdefine m11 m3
+  %xdefine m14 m4
+  %xdefine m15 m5
+  %xdefine m10 m6
+  %define m13 %%p2mem
+  %define m8  [esp+31*16]
+  %define m9  %%flat16mem
+  %define m3  [esp+32*16]
+  %define m4  [esp+33*16]
+  %define m5  [esp+34*16]
+  %define m6  [esp+35*16]
+ %endif
+%endif
+
+    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+    ; write -6
+    mova         m11, %%p3mem
+%if ARCH_X86_64
+    punpcklbw    m14, m8, m11
+    punpckhbw    m15, m8, m11
+%else
+    punpcklbw    m14, m0, m11
+    punpckhbw    m15, m0, m11
+%endif
+%ifidn %2, v
+    mova  [rsp+5*16], m11
+%endif
+    pmaddubsw    m10, m14, [PIC_sym(pb_7_1)]
+    pmaddubsw    m11, m15, [PIC_sym(pb_7_1)]    ; p6*7+p3
+    punpcklbw     m0, m2, m7
+    punpckhbw     m1, m2, m7
+    pmaddubsw     m0, [PIC_sym(pb_2)]
+    pmaddubsw     m1, [PIC_sym(pb_2)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3
+    punpcklbw     m0, m13, m3
+    punpckhbw     m1, m13, m3
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1
+    punpcklbw     m0, m4, m5
+    punpckhbw     m1, m4, m5
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m2
+    por           m0, m1
+%ifidn %2, v
+    mova [tmpq+strideq*2], m0                   ; p5
+%else
+    mova [rsp+13*16], m0
+%endif
+
+    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+    ; write -5
+    pmaddubsw    m14, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m15, [PIC_sym(pb_m1_1)]
+    paddw        m10, m14
+    paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+    punpcklbw     m0, m8, m6
+    punpckhbw     m1, m8, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+3*16], m0
+    mova  [rsp+4*16], m1
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m7
+    por           m0, m1
+%ifidn %2, v
+    mova [tmpq+stride3q], m0                    ; p4
+%else
+    mova [rsp+14*16], m0
+%endif
+
+    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+    ; write -4
+    mova         m14, %%q2mem
+    punpcklbw     m0, m8, m13
+    punpckhbw     m1, m8, m13
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+    punpcklbw     m0, m2, m14
+    punpckhbw     m2, m14
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
+    mova  [rsp+1*16], m0
+    paddw        m10, m0
+    paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, %%p3mem
+    por           m0, m1
+%ifidn %2, v
+    mova [tmpq+strideq*4], m0                   ; p3
+%else
+    mova [rsp+19*16], m0
+%endif
+
+    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+    ; write -3
+    mova         m15, %%q3mem
+    punpcklbw     m0, m8, m3
+    punpckhbw     m1, m8, m3
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+    punpcklbw     m0, m7, m15
+    punpckhbw     m7, m15
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
+    mova  [rsp+2*16], m0
+%if ARCH_X86_32
+ %ifidn %2, v
+    mova [esp+24*16], m7
+ %else
+    mova [esp+36*16], m7
+ %endif
+%endif
+    paddw        m10, m0
+    paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m13
+    por           m0, m1
+    mova  [rsp+6*16], m0                        ; don't clobber p2/m13 since we need it in F
+
+    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+    ; write -2
+    punpcklbw     m0, m8, m4
+    punpckhbw     m1, m8, m4
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%if ARCH_X86_64
+    SWAP           7, 8
+%endif
+%ifidn %2, v
+    mova          m1, [dstq+strideq*4]          ; q4
+    mova          m7, [rsp+5*16]                ; (pre-filter) p3
+%else
+    mova          m1, [rsp+15*16]
+    mova          m7, %%p3mem                   ; (pre-filter) p3
+%endif
+    punpcklbw     m0, m1, m7
+    punpckhbw     m1, m1, m7
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+7*16], m0
+    mova  [rsp+5*16], m1
+    psubw        m10, m0
+    psubw        m11, m1                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m3
+    por           m0, m1
+    mova  [rsp+8*16], m0                        ; don't clobber p1/m3 since we need it in G
+
+    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+    ; write -1
+%ifidn %2, v
+    mova          m7, [tmpq+strideq*1]          ; p6
+    lea         tmpq, [dstq+strideq*4]
+    mova          m1, [tmpq+strideq*1]          ; q5
+%else
+    mova          m7, [rsp+12*16]               ; p6
+    mova          m1, [rsp+16*16]
+%endif
+    punpcklbw     m0, m7, m5
+    punpckhbw     m7, m5
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m7                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+    punpcklbw     m7, m13, m1
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
+    mova  [rsp+9*16], m7
+    paddw        m10, m7
+%if ARCH_X86_64
+    punpckhbw    m13, m1
+    mova          m1, [rsp+6*16]
+    SWAP           1, 13
+%else
+    punpckhbw     m7, m13, m1
+    mova          m1, [esp+6*16]
+    mova         m13, m1
+    SWAP           1, 7
+%endif
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova [rsp+10*16], m1
+    paddw        m11, m1                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m0, m11, [PIC_sym(pw_2048)]
+    packuswb      m7, m0
+    pand          m7, m9
+    pandn         m0, m9, m4
+    por           m7, m0
+    mova  [rsp+6*16], m7                        ; don't clobber p0/m4 since we need it in H
+
+    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+    ; write +0
+%ifidn %2, v
+    mova          m7, [tmpq+strideq*2]          ; q6
+%else
+    mova          m7, [rsp+17*16]
+%endif
+    paddw        m10, [rsp+3*16]
+    paddw        m11, [rsp+4*16]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+    punpcklbw     m0, m3, m7
+    punpckhbw     m1, m3, m7
+%if ARCH_X86_64
+    mova          m3, [rsp+8*16]
+%endif
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+3*16], m0
+    mova  [rsp+4*16], m1
+    paddw        m10, m0
+    paddw        m11, m1                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m5
+    por           m0, m1
+%if ARCH_X86_32
+    mova          m1, [esp+8*16]
+    mova          m3, m1
+%endif
+    mova  [rsp+8*16], m0                        ; don't clobber q0/m5 since we need it in I
+
+    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+    ; write +1
+    paddw        m10, [rsp+1*16]
+    paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+    punpcklbw     m0, m4, m7
+    punpckhbw     m2, m4, m7
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+%if ARCH_X86_64
+    mova          m4, [rsp+6*16]
+%else
+ %define m4 [esp+6*16]
+%endif
+    pmulhrsw      m2, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m2, m1
+    pand          m2, m9
+    pandn         m1, m9, m6
+    por           m2, m1                        ; don't clobber q1/m6 since we need it in K
+
+    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+    ; write +2
+    paddw        m10, [rsp+2*16]
+%if ARCH_X86_64
+    SWAP           7, 8
+    paddw        m11, m7
+%else
+    mova          m8, m7
+ %ifidn %2, v
+    paddw        m11, [esp+24*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %else
+    paddw        m11, [esp+36*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %endif
+%endif
+    punpcklbw     m0, m5, m8
+    punpckhbw     m1, m5, m8
+%if ARCH_X86_64
+    mova          m5, [rsp+8*16]
+%else
+ %define m5 [esp+8*16]
+%endif
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m7, m1
+    pand          m7, m9
+    pandn         m1, m9, m14
+    por           m7, m1                        ; don't clobber q2/m14 since we need it in K
+
+    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+    ; write +3
+    psubw        m10, [rsp+7*16]
+    psubw        m11, [rsp+5*16]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+    punpcklbw     m0, m6, m8
+    punpckhbw     m1, m6, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m15
+    por           m0, m1
+%ifidn %2, v
+    mova [tmpq+mstrideq], m0                    ; q3
+%else
+    mova [rsp+20*16], m0
+%endif
+
+    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+    ; write +4
+    paddw        m10, [rsp+ 9*16]
+    paddw        m11, [rsp+10*16]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+    punpcklbw     m0, m14, m8
+    punpckhbw     m1, m14, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+%ifidn %2, v
+    pandn         m1, m9, [tmpq+strideq*0]
+%else
+    pandn         m1, m9, [rsp+15*16]
+%endif
+    por           m0, m1
+%ifidn %2, v
+    mova [tmpq+strideq*0], m0                    ; q4
+%else
+    mova [rsp+15*16], m0
+%endif
+
+    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+    ; write +5
+    paddw        m10, [rsp+3*16]
+    paddw        m11, [rsp+4*16]                ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+    punpcklbw     m0, m15, m8
+    punpckhbw     m1, m15, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+    pmulhrsw     m10, [PIC_sym(pw_2048)]
+    pmulhrsw     m11, [PIC_sym(pw_2048)]
+    packuswb     m10, m11
+    pand         m10, m9
+%ifidn %2, v
+    pandn        m11, m9, [tmpq+strideq*1]
+%else
+    pandn        m11, m9, [rsp+16*16]
+%endif
+    por          m10, m11
+%ifidn %2, v
+    mova [tmpq+strideq*1], m10                  ; q5
+%else
+    mova [rsp+16*16], m10
+%endif
+
+%if ARCH_X86_64
+    SWAP           0, 8
+    SWAP           1, 9
+    SWAP          14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+    mova     %%q2mem, m7
+ %ifidn %2, v
+    mova          m3, [esp+19*16]
+ %else
+    mova          m3, [esp+32*16]
+ %endif
+    mova          m4, [esp+ 6*16]
+    mova          m5, [esp+ 8*16]
+%endif
+    SWAP          m6, m2
+
+%if ARCH_X86_64
+    mova          m9, %%flat8mem
+%endif
+%ifidn %2, v
+    lea         tmpq, [dstq+mstrideq*4]
+%endif
+%endif ; if %1 == 16
+%if %1 >= 8
+    ; flat8 filter
+%if ARCH_X86_32
+ %define m9  %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+    mova         m11, %%p3mem
+    punpcklbw     m0, m11, m3
+    punpcklbw     m7, m13, m4
+    pmaddubsw     m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+    pmaddubsw     m7, [PIC_sym(pb_2_1)]
+    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0
+    punpcklbw     m7, m5, [PIC_sym(pb_4)]
+    pmaddubsw     m7, [PIC_sym(pb_1)]
+    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+    punpckhbw     m1, m11, m3
+    pmaddubsw     m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+    punpckhbw     m0, m13, m4
+    pmaddubsw     m0, [PIC_sym(pb_2_1)]
+    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0
+    punpckhbw     m0, m5, [PIC_sym(pb_4)]
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m13
+    por           m0, m1                    ; p2
+%ifidn %2, v
+    mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+    SWAP           0, 10
+ %else
+    mova  [esp+2*16], m0
+ %endif
+%endif
+
+%if ARCH_X86_32
+    mova         m11, %%p3mem
+%endif
+    punpcklbw     m0, m11, m3
+    punpckhbw     m1, m11, m3
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1
+    punpcklbw     m0, m13, m6
+    punpckhbw     m1, m13, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m3
+    por           m0, m1            ; p1
+%ifidn %2, v
+    mova [tmpq+strideq*2], m0
+%else
+    mova  [rsp+0*16], m0
+%endif
+
+%if ARCH_X86_32
+    mova         m11, %%p3mem
+%endif
+    punpcklbw     m0, m11, m3
+    punpckhbw     m1, m11, m3
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    psubw         m2, m0
+    psubw         m7, m1
+    punpcklbw     m0, m4, m14
+    punpckhbw     m1, m4, m14
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m4
+    por           m0, m1            ; p0
+%ifidn %2, v
+    mova [tmpq+stride3q], m0
+%else
+    mova  [rsp+1*16], m0
+%endif
+
+    punpcklbw     m0, m5, m15
+    punpckhbw     m1, m5, m15
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw         m2, m0
+    paddw         m7, m1
+%if ARCH_X86_32
+    mova         m11, %%p3mem
+%endif
+    punpcklbw     m0, m11, m4
+    punpckhbw    m11, m11, m4
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw    m11, [PIC_sym(pb_1)]
+    psubw         m2, m0
+    psubw         m7, m11           ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+    psrlw         m0, m2, 3
+    psrlw        m11, m7, 3
+    packuswb      m0, m11
+    pand          m0, m9
+    pandn        m11, m9, m5
+    por          m11, m0            ; q0
+%ifidn %2, v
+    mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+    mova  [esp+8*16], m11
+%endif
+
+    punpcklbw     m0, m5, m15
+    punpckhbw     m1, m5, m15
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1
+    punpcklbw     m0, m13, m6
+    punpckhbw     m1, m13, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m6
+    por           m0, m1            ; q1
+%ifidn %2, v
+    mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+    SWAP           0, 13
+ %else
+    mova  [esp+9*16], m0
+ %endif
+%endif
+
+    punpcklbw     m0, m3, m6
+    punpckhbw     m1, m3, m6
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    psubw         m2, m0
+    psubw         m7, m1
+    punpcklbw     m0, m14, m15
+    punpckhbw     m1, m14, m15
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+    psrlw         m2, 3
+    psrlw         m7, 3
+    packuswb      m2, m7
+    pand          m2, m9
+    pandn         m7, m9, m14
+    por           m2, m7            ; q2
+%ifidn %2, v
+    mova [dstq+strideq*2], m2
+%else
+    mova          m0, [rsp+0*16]
+%if %1 == 8
+    mova          m1, [rsp+1*16]
+    mova          m4, %%p3mem
+
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
+    ; 16x8 transpose
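+    ; (annotation sketch, not in the original source) three interleave
+    ; stages perform the transpose: punpck{l,h}bw merges byte rows into
+    ; word pairs, punpck{l,h}wd merges those into dword groups of four,
+    ; and punpck{l,h}dq leaves 8-byte runs of one output column, which
+    ; the movq/movhps stores below scatter out as rows.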
+    punpcklbw     m3, m4, m10
+    punpckhbw     m4, m10
+    punpcklbw     m5, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m11, m13
+    punpckhbw     m6, m11, m13
+    punpcklbw     m7, m2, m15
+    punpckhbw     m2, m15
+%if ARCH_X86_64
+    SWAP           2, 15
+%else
+    mova         m15, m2
+%endif
+
+    punpcklwd     m2, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m4, m0
+    punpckhwd     m4, m0
+    punpcklwd     m0, m1, m7
+    punpckhwd     m1, m7
+    punpcklwd     m7, m6, m15
+    punpckhwd     m6, m15
+%if ARCH_X86_64
+    SWAP           6, 15
+%else
+    mova         m15, m6
+%endif
+
+    punpckldq     m6, m2, m0
+    punpckhdq     m2, m0
+    punpckldq     m0, m3, m1
+    punpckhdq     m3, m1
+    punpckldq     m1, m5, m7
+    punpckhdq     m5, m7
+    punpckldq     m7, m4, m15
+    punpckhdq     m4, m15
+
+    ; write 8x16
+    movq   [dstq+strideq*0-4], xm6
+    movhps [dstq+strideq*1-4], xm6
+    movq   [dstq+strideq*2-4], xm2
+    movhps [dstq+stride3q -4], xm2
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm0
+    movhps [dstq+strideq*1-4], xm0
+    movq   [dstq+strideq*2-4], xm3
+    movhps [dstq+stride3q -4], xm3
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm1
+    movhps [dstq+strideq*1-4], xm1
+    movq   [dstq+strideq*2-4], xm5
+    movhps [dstq+stride3q -4], xm5
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm7
+    movhps [dstq+strideq*1-4], xm7
+    movq   [dstq+strideq*2-4], xm4
+    movhps [dstq+stride3q -4], xm4
+    lea         dstq, [dstq+strideq*4]
+%else
+    ; 16x16 transpose and store
+    SWAP           6, 0
+    SWAP           7, 1
+ %if ARCH_X86_64
+    SWAP           5, 10, 2
+    SWAP           8, 11
+    SWAP           9, 13
+    mova [rsp+21*16], m12
+ %else
+    mova [esp+10*16], m2
+  %xdefine m8  m0
+  %xdefine m9  m1
+  %xdefine m10 m2
+  %xdefine m11 m3
+  %xdefine m12 m4
+  %xdefine m13 m5
+  %xdefine m14 m6
+  %xdefine m15 m7
+ %endif
+    mova          m0, [rsp+11*16]
+    mova          m1, [rsp+12*16]
+    mova          m2, [rsp+13*16]
+    mova          m3, [rsp+14*16]
+    mova          m4, [rsp+19*16]
+%if ARCH_X86_64
+    mova          m7, [rsp+ 1*16]
+    mova         m11, [rsp+20*16]
+    mova         m12, [rsp+15*16]
+    mova         m13, [rsp+16*16]
+    mova         m14, [rsp+17*16]
+    TRANSPOSE_16X16B 1, [rsp+18*16]
+%else
+    mova          m5, [esp+ 2*16]
+    TRANSPOSE_16X16B 1, [esp+32*16]
+    mov         tmpq, dstq
+    lea         dstq, [dstq+strideq*8]
+%endif
+    movu [dstq+strideq*0-8], xm0
+    movu [dstq+strideq*1-8], xm1
+    movu [dstq+strideq*2-8], xm2
+    movu [dstq+stride3q -8], xm3
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm4
+    movu [dstq+strideq*1-8], xm5
+    movu [dstq+strideq*2-8], xm6
+    movu [dstq+stride3q -8], xm7
+%if ARCH_X86_64
+    lea         dstq, [dstq+strideq*4]
+%else
+  %xdefine m8  m0
+  %xdefine m9  m1
+  %xdefine m10 m2
+  %xdefine m11 m3
+  %xdefine m12 m4
+  %xdefine m13 m5
+  %xdefine m14 m6
+  %xdefine m15 m7
+    mova          m8, [esp+11*16]
+    mova          m9, [esp+12*16]
+    mova         m10, [esp+13*16]
+    mova         m11, [esp+14*16]
+    mova         m12, [esp+26*16]
+    mova         m13, [esp+27*16]
+    mova         m14, [esp+ 0*16]
+    mova         m15, [esp+ 1*16]
+    mov         dstq, tmpq
+%endif
+    movu [dstq+strideq*0-8], xm8
+    movu [dstq+strideq*1-8], xm9
+    movu [dstq+strideq*2-8], xm10
+    movu [dstq+stride3q -8], xm11
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm12
+    movu [dstq+strideq*1-8], xm13
+    movu [dstq+strideq*2-8], xm14
+    movu [dstq+stride3q -8], xm15
+    lea         dstq, [dstq+strideq*4]
+%if ARCH_X86_32
+    lea         dstq, [dstq+strideq*8]
+%else
+    mova         m12, [rsp+21*16]
+%endif
+
+%endif ; if %1 == 8
+%endif ; ifidn %2, v
+%elif %1 == 6
+    ; flat6 filter
+%if ARCH_X86_32
+    mova  [esp+3*16], m3
+    mova  [esp+4*16], m4
+    mova  [esp+5*16], m5
+    mova  [esp+6*16], m6
+ %xdefine m8  m3
+ %xdefine m10 m4
+ %xdefine m11 m5
+ %xdefine m15 m6
+ %define m3  [esp+3*16]
+ %define m4  [esp+4*16]
+ %define m5  [esp+5*16]
+ %define m6  [esp+6*16]
+ %define m9  %%flat8mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+%endif
+
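+    ; (annotation sketch, not in the original source) the 6-tap flat
+    ; filter outputs, with Round2(x, 3) = (x+4)>>3 implemented as
+    ; pmulhrsw against pw_4096, are:
+    ;   p1' = Round2(p2*3 + p1*2 + p0*2 + q0, 3)
+    ;   p0' = Round2(p2 + p1*2 + p0*2 + q0*2 + q1, 3)
+    ;   q0' = Round2(p1 + p0*2 + q0*2 + q1*2 + q2, 3)
+    ;   q1' = Round2(p0 + q0*2 + q1*2 + q2*3, 3)
+    ; each step below updates the running sum with one pmaddubsw against
+    ; a signed tap pair (pb_m1_1, pb_m1_2, pb_m1_0).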
+    punpcklbw     m8, m13, m5
+    punpckhbw    m11, m13, m5
+    pmaddubsw     m0, m8, [PIC_sym(pb_3_1)]
+    pmaddubsw     m1, m11, [PIC_sym(pb_3_1)]
+    punpcklbw     m7, m4, m3
+    punpckhbw    m10, m4, m3
+    pmaddubsw     m2, m7, [PIC_sym(pb_2)]
+    pmaddubsw    m15, m10, [PIC_sym(pb_2)]
+    paddw         m0, m2
+    paddw         m1, m15
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
+    packuswb      m2, m15
+    pand          m2, m9
+    pandn        m15, m9, m3
+    por           m2, m15
+%ifidn %2, v
+    mova [tmpq+strideq*2], m2                   ; p1
+%elif ARCH_X86_32
+    mova [esp+11*16], m2
+%endif
+
+    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
+    paddw         m0, m8
+    paddw         m1, m11
+    punpcklbw     m8, m13, m6
+    punpckhbw    m11, m13, m6
+%if ARCH_X86_64
+    SWAP           2, 13
+%endif
+    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
+    paddw         m0, m8
+    paddw         m1, m11
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
+    packuswb      m2, m15
+    pand          m2, m9
+    pandn        m15, m9, m4
+    por           m2, m15
+%ifidn %2, v
+    mova [tmpq+stride3q], m2                    ; p0
+%elif ARCH_X86_32
+    mova  [esp+8*16], m2
+%endif
+
+    paddw         m0, m8
+    paddw         m1, m11
+    punpcklbw     m8, m3, m14
+    punpckhbw    m11, m3, m14
+%if ARCH_X86_64
+    SWAP           2, 14
+%endif
+    pmaddubsw     m2, m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m15, m11, [PIC_sym(pb_m1_1)]
+    paddw         m0, m2
+    paddw         m1, m15
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
+    packuswb      m2, m15
+    pand          m2, m9
+    pandn        m15, m9, m5
+    por           m2, m15
+%ifidn %2, v
+    mova [dstq+strideq*0], m2                   ; q0
+%endif
+
+    pmaddubsw     m8, [PIC_sym(pb_m1_2)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_2)]
+    paddw         m0, m8
+    paddw         m1, m11
+    pmaddubsw     m7, [PIC_sym(pb_m1_0)]
+    pmaddubsw    m10, [PIC_sym(pb_m1_0)]
+    paddw         m0, m7
+    paddw         m1, m10
+    pmulhrsw      m0, [PIC_sym(pw_4096)]
+    pmulhrsw      m1, [PIC_sym(pw_4096)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m6
+    por           m0, m1
+%if ARCH_X86_32
+ %xdefine m3 m8
+ %xdefine m4 m10
+ %xdefine m5 m11
+ %xdefine m6 m15
+%endif
+%ifidn %2, v
+    mova [dstq+strideq*1], m0                   ; q1
+%else
+ %if ARCH_X86_64
+    SWAP           3, 13
+    SWAP           4, 14
+ %else
+    mova          m3, [esp+11*16]
+    mova          m4, [esp+ 8*16]
+ %endif
+    SWAP           5, 2
+    SWAP           6, 0
+    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%else ; if %1 == 4
+%ifidn %2, v
+    mova [tmpq+strideq*0], m3                   ; p1
+    mova [tmpq+strideq*1], m4                   ; p0
+    mova [tmpq+strideq*2], m5                   ; q0
+    mova [tmpq+stride3q ], m6                   ; q1
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%endif
+%if ARCH_X86_32
+ %define m12 m12reg
+%endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;          32-bit PIC helpers          ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 0 ; PIC_reg
+  %define PIC_reg r2
+  %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
+    LEA      PIC_reg, $$
+ %endmacro
+
+ %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
+  %if %1 == 0
+    mov [esp+PIC_reg_stk_offset], PIC_reg
+    mov      PIC_reg, maskm
+  %else
+    mov      PIC_reg, [esp+PIC_reg_stk_offset]
+  %endif
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 1
+ %endmacro
+ %define PIC_sym(sym) (sym)
+%endif
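+
+; (annotation sketch, not in the original source) x86-32 has no
+; RIP-relative addressing, so constants are reached through a register
+; holding the address of $$; PIC_sym rewrites an absolute reference into
+; a base-relative one, e.g.
+;   mova m0, [PIC_sym(pw_4096)]
+; expands to
+;   mova m0, [PIC_reg + (pw_4096 - PIC_base_offset)]
+; XCHG_PIC_REG swaps r2 between its two roles: mask pointer (its cglobal
+; argument) and PIC base, spilling whichever is inactive to the stack.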
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+  %assign copy_args 1
+ %else
+  %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 1
+ %if copy_args
+  %define maskm     [esp+stack_size-gprsize*1]
+  %define l_stridem [esp+stack_size-gprsize*2]
+  %define lutm      [esp+stack_size-gprsize*3]
+  %define %1m       [esp+stack_size-gprsize*4]
+    mov          r6d, r6m
+    mov        maskm, maskd
+    mov         lutm, lutd
+    mov          %1m, r6d
+ %else
+  %define %1m       r6m
+ %endif
+%endmacro
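+
+; (annotation sketch, not in the original source) when the caller's
+; stack alignment is below what the spill area requires, the prologue
+; realigns esp and the incoming argument slots can no longer be reached
+; through plain esp offsets; RELOC_ARGS therefore copies the arguments
+; the loops re-read (mask, lut, w/h, plus a slot for l_stride that the
+; function body fills) to the top of the realigned frame.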
+
+%if ARCH_X86_32
+ %define tmpq       r4
+ %define mstrideq   r5
+ %define stride3q   r6
+ %define l_stride3q r6
+%endif
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
+                    dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS w
+    SETUP_PIC
+ %define m12 m5
+%endif
+    shl    l_strideq, 2
+    sub           lq, l_strideq
+%if ARCH_X86_64
+    mov     mstrideq, strideq
+    neg     mstrideq
+    lea     stride3q, [strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
+    mov   mask_bitsd, 0xf
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
+    movu          m0, [maskq]
+    pxor          m4, m4
+    movd          m3, [lutq+136]
+    pshufb        m3, m4
+    pshufd        m2, m0, q2222
+    pshufd        m1, m0, q1111
+    pshufd        m0, m0, q0000
+    por           m1, m2
+    por           m0, m1
+    mova [rsp+11*16], m0
+    mova [rsp+12*16], m1
+    mova [rsp+13*16], m2
+    mova [rsp+14*16], m3
+
+%define maskmem [esp+15*16]
+%define mask0   [rsp+11*16]
+%define mask1   [rsp+12*16]
+%define mask2   [rsp+13*16]
+%define minlvl  [rsp+14*16]
+
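+    ; (annotation sketch, not in the original source) each trip through
+    ; .loop handles one 4-pixel-wide column: mask_bitsd starts at 0xf
+    ; and is shifted left by 4 per column, and vmask[2]/[1]/[0] are
+    ; tested widest-first so every column takes exactly one of the
+    ; wd=16/8/4 FILTER paths, or none at all; the uv functions below do
+    ; the same with only the wd=6/4 variants.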
+.loop:
+    test   [maskq+8], mask_bitsd                ; vmask[2]
+    je .no_flat16
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER        16, v
+    jmp .end
+
+.no_flat16:
+    test   [maskq+4], mask_bitsd                ; vmask[1]
+    je .no_flat
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         8, v
+    jmp .end
+
+.no_flat:
+    test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
+    je .no_filter
+
+%if ARCH_X86_32
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         4, v
+
+.end:
+%if ARCH_X86_32
+    mova         m12, maskmem
+    mov   mask_bitsd, [esp+25*16]
+%endif
+.no_filter:
+    pslld        m12, 4
+    shl   mask_bitsd, 4
+    add           lq, 16
+    add         dstq, 16
+%if ARCH_X86_64
+    sub           wd, 4
+%else
+    sub     dword wm, 4
+%endif
+    XCHG_PIC_REG   0
+    jg .loop
+    RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
+                    dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS h
+    SETUP_PIC
+ %define m12 m5
+%endif
+    sub           lq, 4
+    shl    l_strideq, 2
+%if ARCH_X86_64
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
+    mov   mask_bitsd, 0xf
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
+    movu          m0, [maskq]
+    pxor          m4, m4
+    movd          m3, [lutq+136]
+    pshufb        m3, m4
+    pshufd        m2, m0, q2222
+    pshufd        m1, m0, q1111
+    pshufd        m0, m0, q0000
+    por           m1, m2
+    por           m0, m1
+    mova [rsp+22*16], m0
+    mova [rsp+23*16], m1
+    mova [rsp+24*16], m2
+    mova [rsp+25*16], m3
+
+%define maskmem [esp+37*16]
+%define mask0   [rsp+22*16]
+%define mask1   [rsp+23*16]
+%define mask2   [rsp+24*16]
+%define minlvl  [rsp+25*16]
+
+.loop:
+    test   [maskq+8], mask_bitsd                ; vmask[2]
+    je .no_flat16
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER        16, h
+    jmp .end
+
+.no_flat16:
+    test   [maskq+4], mask_bitsd                ; vmask[1]
+    je .no_flat
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         8, h
+    jmp .end
+
+.no_flat:
+    test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
+    je .no_filter
+
+%if ARCH_X86_32
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+strideq*8]
+    lea         dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+    jmp .end_noload
+.end:
+    mova         m12, maskmem
+    mov    l_strideq, l_stridem
+    mov   mask_bitsd, [esp+38*16]
+.end_noload:
+%else
+.end:
+%endif
+    lea           lq, [lq+l_strideq*4]
+    pslld        m12, 4
+    shl   mask_bitsd, 4
+%if ARCH_X86_64
+    sub           hd, 4
+%else
+    sub     dword hm, 4
+%endif
+    XCHG_PIC_REG   0
+    jg .loop
+    RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
+                     dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS w
+    SETUP_PIC
+ %define m12 m4
+%endif
+    shl    l_strideq, 2
+    sub           lq, l_strideq
+%if ARCH_X86_64
+    mov     mstrideq, strideq
+    neg     mstrideq
+    lea     stride3q, [strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
+    mov   mask_bitsd, 0xf
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
+    movq          m0, [maskq]
+    pxor          m3, m3
+    movd          m2, [lutq+136]
+    pshufb        m2, m3
+    pshufd        m1, m0, q1111
+    pshufd        m0, m0, q0000
+    por           m0, m1
+    mova  [rsp+0*16], m0
+    mova  [rsp+1*16], m1
+    mova  [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0   [rsp+0*16]
+%define mask1   [rsp+1*16]
+%define minlvl  [rsp+2*16]
+
+.loop:
+    test   [maskq+4], mask_bitsd                ; vmask[1]
+    je .no_flat
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+11*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         6, v
+    jmp .end
+
+.no_flat:
+    test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
+    je .no_filter
+
+%if ARCH_X86_32
+    mov  [esp+11*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         4, v
+
+.end:
+%if ARCH_X86_32
+    mova         m12, maskmem
+    mov   mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
+    pslld        m12, 4
+    shl   mask_bitsd, 4
+    add           lq, 16
+    add         dstq, 16
+%if ARCH_X86_64
+    sub           wd, 4
+%else
+    sub     dword wm, 4
+%endif
+    XCHG_PIC_REG   0
+    jg .loop
+    RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
+                     dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS h
+    SETUP_PIC
+ %define m12 m4
+%endif
+    sub           lq, 4
+    shl    l_strideq, 2
+%if ARCH_X86_64
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
+    mov   mask_bitsd, 0xf
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
+    movq          m0, [maskq]
+    pxor          m3, m3
+    movd          m2, [lutq+136]
+    pshufb        m2, m3
+    pshufd        m1, m0, q1111
+    pshufd        m0, m0, q0000
+    por           m0, m1
+    mova  [rsp+0*16], m0
+    mova  [rsp+1*16], m1
+    mova  [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0   [rsp+0*16]
+%define mask1   [rsp+1*16]
+%define minlvl  [rsp+2*16]
+
+.loop:
+    test   [maskq+4], mask_bitsd                ; vmask[1]
+    je .no_flat
+
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+12*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         6, h
+    jmp .end
+
+.no_flat:
+    test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
+    je .no_filter
+
+%if ARCH_X86_32
+    mov  [esp+12*16], mask_bitsd
+    mova     maskmem, m12
+%endif
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+strideq*8]
+    lea         dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+    jmp .end_noload
+.end:
+    mova         m12, maskmem
+    mov    l_strided, l_stridem
+    mov   mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
+    lea           lq, [lq+l_strideq*4]
+    pslld        m12, 4
+    shl   mask_bitsd, 4
+%if ARCH_X86_64
+    sub           hd, 4
+%else
+    sub     dword hm, 4
+%endif
+    XCHG_PIC_REG   0
+    jg .loop
+    RET
diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm
new file mode 100644
index 0000000..3e3c35c
--- /dev/null
@@ -0,0 +1,1157 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_right_ext_mask: times 32 db 0xff
+                   times 32 db 0
+pb_14x0_1_2: times 14 db 0
+             db 1, 2
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_15: times 16 db 15
+pw_16: times 2 dw 16
+pw_256: times 2 dw 256
+pw_2048: times 2 dw 2048
+pw_16380: times 2 dw 16380
+pw_0_128: dw 0, 128
+pw_5_6: dw 5, 6
+pd_6: dd 6
+pd_1024: dd 1024
+pd_0xf0080029: dd 0xf0080029
+pd_0xf00801c7: dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
+    mov        edged, edgem
+    vpbroadcastb m15, [fhq+0]
+    movifnidn     wd, wm
+    vpbroadcastb m14, [fhq+2]
+    mov           hd, hm
+    vpbroadcastb m13, [fhq+4]
+    vpbroadcastw m12, [fhq+6]
+    vpbroadcastd m11, [pw_2048]
+    vpbroadcastd m10, [pw_16380]
+    lea          r11, [pb_right_ext_mask]
+
+    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+
+    ; if (edge & has_right) align_w_to_32
+    ; else w -= 3, and use that as limit in x loop
+    test       edgeb, 2 ; has_right
+    jnz .align
+    mov        xlimq, -3
+    jmp .loop
+.align:
+    add           wd, 31
+    and           wd, ~31
+    xor        xlimd, xlimd
+
+    ; main y loop for horizontal filter
+.loop:
+    mov      srcptrq, srcq
+    mov      dstptrq, dstq
+    lea           xq, [wq+xlimq]
+
+    ; load left edge pixels
+    test       edgeb, 1 ; have_left
+    jz .emu_left
+    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
+    jz .load_left_combined
+    movd         xm0, [leftq]
+    add        leftq, 4
+    pinsrd       xm0, [srcq], 1
+    pslldq       xm0, 9
+    jmp .left_load_done
+.load_left_combined:
+    movq         xm0, [srcq-3]
+    pslldq       xm0, 10
+    jmp .left_load_done
+.emu_left:
+    movd         xm0, [srcq]
+    pshufb       xm0, [pb_14x0_1_2]
+
+    ; load right edge pixels
+.left_load_done:
+    cmp           xd, 32
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    je .splat_right
+
+    ; for very small images (w=[1-2]), edge-extend the original cache,
+    ; ugly, but only runs in very odd cases
+    add           wd, wd
+    pshufb       xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+    shr           wd, 1
+
+    ; main x loop; most iterations enter at .main_load
+.splat_right:
+    ; no need to load new pixels, just extend them from the (possibly previously
+    ; extended) previous load into m0
+    pshufb       xm1, xm0, [pb_15]
+    jmp .main_loop
+.load_and_splat:
+    ; load new pixels and extend edge for right-most
+    movu          m1, [srcptrq+3]
+    sub          r11, xq
+    movu          m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
+    add          r11, xq
+    vpbroadcastb  m3, [srcptrq+2+xq]
+    pand          m1, m2
+    pandn         m3, m2, m3
+    por           m1, m3
+    jmp .main_loop
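+    ; (annotation sketch, not in the original source) the pand/pandn/por
+    ; above is a branch-free edge extension:
+    ;   m1 = (loaded & mask) | (rightmost_pixel & ~mask)
+    ; where the mask, taken from pb_right_ext_mask at an offset
+    ; proportional to x, holds 0xff for lanes inside the row and 0
+    ; beyond it.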
+.main_load:
+    ; load subsequent line
+    movu          m1, [srcptrq+3]
+.main_loop:
+    vinserti128   m0, xm1, 1
+
+    palignr       m2, m1, m0, 10
+    palignr       m3, m1, m0, 11
+    palignr       m4, m1, m0, 12
+    palignr       m5, m1, m0, 13
+    palignr       m6, m1, m0, 14
+    palignr       m7, m1, m0, 15
+
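+    ; (annotation sketch, not in the original source) m2..m7 and m1 are
+    ; shifted views of the source window covering the seven taps
+    ; x-3..x+3. The interleaves below pair tap k with tap 6-k so one
+    ; pmaddubsw applies both copies of the symmetric byte coefficient
+    ; fh[0]/fh[1]/fh[2]; the centre tap is handled separately via pmullw
+    ; with the word coefficient plus the <<7 bias path.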
+    punpcklbw     m0, m2, m1
+    punpckhbw     m2, m1
+    punpcklbw     m8, m3, m7
+    punpckhbw     m3, m7
+    punpcklbw     m7, m4, m6
+    punpckhbw     m4, m6
+    pxor          m9, m9
+    punpcklbw     m6, m5, m9
+    punpckhbw     m5, m9
+
+    pmaddubsw     m0, m15
+    pmaddubsw     m2, m15
+    pmaddubsw     m8, m14
+    pmaddubsw     m3, m14
+    pmaddubsw     m7, m13
+    pmaddubsw     m4, m13
+    paddw         m0, m8
+    paddw         m2, m3
+    psllw         m8, m6, 7
+    psllw         m3, m5, 7
+    psubw         m8, m10
+    psubw         m3, m10
+    pmullw        m6, m12
+    pmullw        m5, m12
+    paddw         m0, m7
+    paddw         m2, m4
+    paddw         m0, m6
+    paddw         m2, m5
+    ; for a signed overflow to happen we need filter and pixels as follow:
+    ; filter => -5,-23,-17,90,-17,-23,-5
+    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
+    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
+    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
+    ;  32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
+    ; => signed 16-bit overflow occurs
+    paddsw        m0, m8  ; paddsw clips this range to [-8000;+7FFF]
+    paddsw        m2, m3
+    psraw         m0, 3   ; shift changes the range to [-1000;+FFF]
+    psraw         m2, 3
+    paddw         m0, m11 ; adding back 800 (removed in m8) changes the
+    paddw         m2, m11 ; range to [-800;+17FF] as defined in the spec
+    mova   [dstptrq], xm0 ; (note that adding another 800 would give us
+    mova [dstptrq+16], xm2;  the same range as in the C code => [0;1FFF])
+    vextracti128 [dstptrq+32], m0, 1
+    vextracti128 [dstptrq+48], m2, 1
+    vextracti128 xm0, m1, 1
+    add      srcptrq, 32
+    add      dstptrq, 64
+    sub           xq, 32
+    cmp           xd, 32
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    cmp           xd, xlimd
+    jg .splat_right
+
+    add         srcq, strideq
+    add         dstq, 384*2
+    dec           hd
+    jg .loop
+    RET
+
+cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
+    movifnidn    fvq, fvmp
+    mov        edged, edgem
+    movifnidn     hd, hm
+    vpbroadcastd m10, [fvq]
+    vpbroadcastd m11, [fvq+4]
+    vpbroadcastd  m0, [pw_0_128]
+    vpbroadcastd m12, [pd_1024]
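+    ; (annotation sketch, not in the original source) m10/m11 hold the
+    ; vertical taps as word pairs (fv[0],fv[1]) and (fv[2],fv[3]); the
+    ; pw_0_128 add below makes the centre tap fv[3]+128, matching the
+    ; bias noted in the wiener_filter_h range comments, and pd_1024 is
+    ; the rounding term of the final (sum+1024)>>11 before packing.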
+
+    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
+    rorx       ylimd, edged, 2
+    paddw        m11, m0
+    and        ylimd, 2 ; have_bottom
+    sub        ylimd, 3
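+    ; (annotation sketch, not in the original source) edge bit 3 is
+    ; have_bottom; rorx+and map it to 2 or 0, so ylim is -1 when rows
+    ; exist below the block and -3 when they do not, steering the tail
+    ; of the y loop between loading new rows and edge extension.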
+
+    ; main x loop for vertical filter, does one column of 16 pixels
+.loop_x:
+    mova          m3, [midq] ; middle line
+
+    ; load top pixels
+    test       edgeb, 4 ; have_top
+    jz .emu_top
+    mova          m0, [midq-384*4]
+    mova          m2, [midq-384*2]
+    mova          m1, m0
+    jmp .load_bottom_pixels
+.emu_top:
+    mova          m0, m3
+    mova          m1, m3
+    mova          m2, m3
+
+    ; load bottom pixels
+.load_bottom_pixels:
+    mov           yd, hd
+    mov        mptrq, midq
+    mov      dstptrq, dstq
+    add           yd, ylimd
+    jg .load_threelines
+
+    ; the remainder here is somewhat messy but only runs in very weird
+    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+    ; so performance is not terribly important here...
+    je .load_twolines
+    cmp           yd, -1
+    je .load_oneline
+    ; h == 1 case
+    mova          m5, m3
+    mova          m4, m3
+    mova          m6, m3
+    jmp .loop
+.load_oneline:
+    ; h == 2 case
+    mova          m4, [midq+384*2]
+    mova          m5, m4
+    mova          m6, m4
+    jmp .loop
+.load_twolines:
+    ; h == 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    mova          m6, m5
+    jmp .loop
+.load_threelines:
+    ; h > 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    ; third line loaded in main loop below
+
+    ; main y loop for vertical filter
+.loop_load:
+    ; load one line into m6. If that line is no longer available, do
+    ; nothing, since m6 still holds the data from the previous line. We
+    ; try to structure the loop so that the common case is evaluated fastest
+    mova          m6, [mptrq+384*6]
+.loop:
+    paddw         m0, m6
+    paddw         m7, m1, m5
+    paddw         m8, m2, m4
+    punpcklwd     m9, m0, m7
+    punpckhwd     m0, m7
+    punpcklwd     m7, m8, m3
+    punpckhwd     m8, m3
+    pmaddwd       m9, m10
+    pmaddwd       m0, m10
+    pmaddwd       m7, m11
+    pmaddwd       m8, m11
+    add        mptrq, 384*2
+    paddd         m7, m9
+    paddd         m0, m8
+    paddd         m7, m12
+    paddd         m0, m12
+    psrad         m7, 11
+    psrad         m0, 11
+    packssdw      m7, m0
+    vextracti128 xm0, m7, 1
+    packuswb     xm7, xm0
+    mova   [dstptrq], xm7
+    ; shift pixels one position
+    mova          m0, m1
+    mova          m1, m2
+    mova          m2, m3
+    mova          m3, m4
+    mova          m4, m5
+    mova          m5, m6
+    add      dstptrq, strideq
+    dec           yd
+    jg .loop_load
+    ; for the bottom pixels, continue using m6 (as extended edge)
+    cmp           yd, ylimd
+    jg .loop
+    add         midq, 32
+    add         dstq, 16
+    sub           wd, 16
+    jg .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        xlimd, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    mov        edged, xlimd
+    and        xlimd, 2                             ; have_right
+    jz .no_right
+    add           wd, 2+15
+    and           wd, ~15
+.no_right:
+    lea          r10, [pb_right_ext_mask+32]
+    xor        xlimd, 2                             ; 2*!have_right
+    pxor          m1, m1
+    add         srcq, wq
+    lea         sumq, [sumq+wq*2-2]
+    lea       sumsqq, [sumsqq+wq*4-4]
+    neg           wq
+.loop_y:
+    mov           xq, wq
+
+    ; load left
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    vpbroadcastw xm0, [leftq+2]
+    add        leftq, 4
+    jmp .expand_x
+.no_left:
+    vpbroadcastb xm0, [srcq+xq]
+    jmp .expand_x
+.load_left_from_main:
+    vpbroadcastw xm0, [srcq+xq-2]
+.expand_x:
+    punpckhbw    xm0, xm1
+
+    ; when we reach this, xm0 contains left two px in highest words
+    cmp           xd, -16
+    jle .loop_x
+.partial_load_and_extend:
+    vpbroadcastb  m3, [srcq-1]
+    pmovzxbw      m2, [srcq+xq]
+    movu          m4, [r10+xq*2]
+    punpcklbw     m3, m1
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq       xm2, xm0, 14
+    vpbroadcastw  m2, xm2
+    jmp .loop_x_noload
+
+.loop_x:
+    pmovzxbw      m2, [srcq+xq]
+.loop_x_noload:
+    vinserti128   m0, xm2, 1
+    palignr       m3, m2, m0, 12
+    palignr       m4, m2, m0, 14
+
+    punpcklwd     m5, m3, m2
+    punpckhwd     m6, m3, m2
+    paddw         m3, m4
+    punpcklwd     m0, m4, m1
+    punpckhwd     m4, m1
+    pmaddwd       m5, m5
+    pmaddwd       m6, m6
+    pmaddwd       m0, m0
+    pmaddwd       m4, m4
+    paddw         m3, m2
+    paddd         m5, m0
+    vextracti128 xm0, m2, 1
+    paddd         m6, m4
+    movu [sumq+xq*2], m3
+    movu         [sumsqq+xq*4+ 0], xm5
+    movu         [sumsqq+xq*4+16], xm6
+    vextracti128 [sumsqq+xq*4+32], m5, 1
+    vextracti128 [sumsqq+xq*4+48], m6, 1
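+    ; (annotation sketch, not in the original source) the 3-tap box sums
+    ; come from two shifted views of the row, sum = s[x-2]+s[x-1]+s[x];
+    ; pmaddwd of a register with itself, after interleaving two of the
+    ; views, yields a*a + b*b per dword lane in a single instruction.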
+    add           xq, 16
+
+    ; if x <= -16 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -16
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    add         srcq, strideq
+    dec           hd
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
+    mov           xq, -2
+    rorx       ylimd, edged, 2
+    and        ylimd, 2                             ; have_bottom
+    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
+.loop_x:
+    lea           yd, [hq+ylimq+2]
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu          m6, [sum_ptrq+(384+16)*2*1]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    mova          m7, m6
+    mova          m8, m6
+    jmp .loop_y_noload
+.load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l2sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]   ; l2sq [right]
+    movu          m2, [sumsq_ptrq-(384+16)*4*0]      ; l1sq [left]
+    movu          m3, [sumsq_ptrq-(384+16)*4*0+32]   ; l1sq [right]
+    movu          m6, [sum_ptrq-(384+16)*2*1]        ; l2
+    movu          m7, [sum_ptrq-(384+16)*2*0]        ; l1
+.loop_y:
+    movu          m4, [sumsq_ptrq+(384+16)*4*1]      ; l0sq [left]
+    movu          m5, [sumsq_ptrq+(384+16)*4*1+32]   ; l0sq [right]
+    movu          m8, [sum_ptrq+(384+16)*2*1]        ; l0
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw         m6, m7
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw         m6, m8
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+32], m1
+    movu  [sum_ptrq], m6
+
+    ; shift position down by one
+    mova          m0, m2
+    mova          m1, m3
+    mova          m2, m4
+    mova          m3, m5
+    mova          m6, m7
+    mova          m7, m8
+    add   sumsq_ptrq, (384+16)*4
+    add     sum_ptrq, (384+16)*2
+    dec           yd
+    jg .loop_y
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 16
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+    lea           r5, [sgr_x_by_x-0xf03]
+%ifidn sd, sm
+    movd         xm6, sd
+    vpbroadcastd  m6, xm6
+%else
+    vpbroadcastd  m6, sm
+%endif
+    vpbroadcastd  m8, [pd_0xf00801c7]
+    vpbroadcastd  m9, [pw_256]
+    pcmpeqb       m7, m7
+    psrld        m10, m9, 13                        ; pd_2048
+    DEFINE_ARGS a, b, w, h, x
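+    ; (annotation sketch, not in the original source) pd_0xf00801c7 does
+    ; double duty: as a pmaddwd multiplier it computes b*455 (low word
+    ; 0x1c7 = 455; the high word multiplies the zero upper half of the
+    ; widened b), and as a paddusw addend it supplies the 1<<19 rounding
+    ; of z = (p*s + (1<<19)) >> 20 (plus a negligible 0x1c7 in the low
+    ; word) while biasing z by 0xf00 with unsigned saturation, which
+    ; caps the table index at 255; the gather base sgr_x_by_x-0xf03 and
+    ; the psrld-by-24 byte extract then undo that bias.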
+
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    pmovzxwd      m0, [bq+xq*2]
+    pmovzxwd      m1, [bq+xq*2+(384+16)*2]
+    movu          m2, [aq+xq*4]
+    movu          m3, [aq+xq*4+(384+16)*4]
+    pslld         m4, m2, 3
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    pmaddwd       m0, m8
+    pmaddwd       m1, m8
+    psubd         m2, m4                            ; p = aa * 9 - bb * bb
+    psubd         m3, m5
+    pmulld        m2, m6
+    pmulld        m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    mova          m5, m7
+    vpgatherdd    m4, [r5+m2], m5                   ; xx
+    mova          m5, m7
+    vpgatherdd    m2, [r5+m3], m5
+    psrld         m4, 24
+    psrld         m2, 24
+    pmulld        m0, m4
+    pmulld        m1, m2
+    packssdw      m4, m2
+    psubw         m4, m9, m4
+    vpermq        m4, m4, q3120
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 12
+    psrld         m1, 12
+    movu   [bq+xq*2], xm4
+    vextracti128 [bq+xq*2+(384+16)*2], m4, 1
+    movu   [aq+xq*4], m0
+    movu [aq+xq*4+(384+16)*4], m1
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    vpbroadcastd m15, [pw_16]
+    xor           xd, xd
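+    ; (annotation sketch, not in the original source) each output mixes
+    ; the 3x3 neighbourhood of a/b with weight 4 on the centre cross
+    ; (self + left/right/top/bottom) and weight 3 on the diagonals,
+    ; computed as (cross + diag)*4 - diag; keeping running top/ctr/bottom
+    ; and l/r sums means only one new row of a/b is loaded per y step.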
+.loop_x:
+    lea     tmp_ptrq, [tq+xq*2]
+    lea     src_ptrq, [srcq+xq*1]
+    lea       a_ptrq, [aq+xq*4+(384+16)*4]
+    lea       b_ptrq, [bq+xq*2+(384+16)*2]
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    mova          m1, [aq+xq*4-(384+16)*4]           ; a:top [first half]
+    paddd         m0, m2                            ; a:tl+tr [first half]
+    movu          m2, [aq+xq*4-(384+16)*4-4+32]
+    movu          m4, [aq+xq*4-(384+16)*4+4+32]
+    mova          m3, [aq+xq*4-(384+16)*4+32]        ; a:top [second half]
+    paddd         m2, m4                            ; a:tl+tr [second half]
+    movu          m4, [aq+xq*4-4]
+    movu          m5, [aq+xq*4+4]
+    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
+    paddd         m4, m5                            ; a:l+r [first half]
+    movu          m5, [aq+xq*4+32-4]
+    movu          m6, [aq+xq*4+32+4]
+    paddd         m3, [aq+xq*4+32]                  ; a:top+ctr [second half]
+    paddd         m5, m6                            ; a:l+r [second half]
+
+    movu          m6, [bq+xq*2-(384+16)*2-2]
+    movu          m8, [bq+xq*2-(384+16)*2+2]
+    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
+    paddw         m6, m8                            ; b:tl+tr
+    movu          m8, [bq+xq*2-2]
+    movu          m9, [bq+xq*2+2]
+    paddw         m7, [bq+xq*2]                     ; b:top+ctr
+    paddw         m8, m9                            ; b:l+r
+    mov           yd, hd
+.loop_y:
+    movu          m9, [b_ptrq-2]
+    movu         m10, [b_ptrq+2]
+    paddw         m7, [b_ptrq]                      ; b:top+ctr+bottom
+    paddw         m9, m10                           ; b:bl+br
+    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
+    paddw         m6, m9                            ; b:tl+tr+bl+br
+    psubw         m7, [b_ptrq-(384+16)*2*2]         ; b:ctr+bottom
+    paddw        m10, m6
+    psllw        m10, 2
+    psubw        m10, m6                            ; aa
+    pmovzxbw     m12, [src_ptrq]
+    punpcklwd     m6, m10, m15
+    punpckhwd    m10, m15
+    punpcklwd    m13, m12, m15
+    punpckhwd    m12, m15
+    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
+    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
+
+    movu         m11, [a_ptrq-4]
+    movu         m12, [a_ptrq+4]
+    paddd         m1, [a_ptrq]                      ; a:top+ctr+bottom [first half]
+    paddd        m11, m12                           ; a:bl+br [first half]
+    movu         m12, [a_ptrq+32-4]
+    movu         m13, [a_ptrq+32+4]
+    paddd         m3, [a_ptrq+32]                   ; a:top+ctr+bottom [second half]
+    paddd        m12, m13                           ; a:bl+br [second half]
+    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
+    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
+    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
+    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
+    paddd        m13, m0
+    paddd        m14, m2
+    pslld        m13, 2
+    pslld        m14, 2
+    psubd        m13, m0                            ; bb [first half]
+    psubd        m14, m2                            ; bb [second half]
+    vperm2i128    m0, m13, m14, 0x31
+    vinserti128  m13, xm14, 1
+    psubd         m1, [a_ptrq-(384+16)*4*2]          ; a:ctr+bottom [first half]
+    psubd         m3, [a_ptrq-(384+16)*4*2+32]       ; a:ctr+bottom [second half]
+
+    paddd         m6, m13
+    paddd        m10, m0
+    psrad         m6, 9
+    psrad        m10, 9
+    packssdw      m6, m10
+    mova  [tmp_ptrq], m6
+
+    ; shift to next row
+    mova          m0, m4
+    mova          m2, m5
+    mova          m4, m11
+    mova          m5, m12
+    mova          m6, m8
+    mova          m8, m9
+
+    add       a_ptrq, (384+16)*4
+    add       b_ptrq, (384+16)*2
+    add     tmp_ptrq, 384*2
+    add     src_ptrq, strideq
+    dec           yd
+    jg .loop_y
+    add           xd, 16
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
+INIT_YMM avx2
+cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+%ifidn wtd, wtm
+    shl          wtd, 4
+    movd         xm5, wtd
+    vpbroadcastw  m5, xm5
+%else
+    vpbroadcastw  m5, wtm
+    mov           hd, hm
+    psllw         m5, 4
+%endif
+    DEFINE_ARGS dst, stride, t, w, h, idx
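+    ; (annotation sketch, not in the original source) pre-shifting wt
+    ; left by 4 lets one pmulhrsw do the whole weighted blend rounding:
+    ;   pmulhrsw(t - dst*16, wt<<4) = ((t - dst*16)*wt + 1024) >> 11
+    ; which is then added back onto dst before packing.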
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m0, [tq+idxq*2+ 0]
+    mova          m1, [tq+idxq*2+32]
+    pmovzxbw      m2, [dstq+idxq+ 0]
+    pmovzxbw      m3, [dstq+idxq+16]
+    psllw         m4, m2, 4
+    psubw         m0, m4
+    psllw         m4, m3, 4
+    psubw         m1, m4
+    pmulhrsw      m0, m5
+    pmulhrsw      m1, m5
+    paddw         m0, m2
+    paddw         m1, m3
+    packuswb      m0, m1
+    vpermq        m0, m0, q3120
+    mova [dstq+idxq], m0
+    add         idxd, 32
+    cmp         idxd, wd
+    jl .loop_x
+    add           tq, 384*2
+    add         dstq, strideq
+    dec           hd
+    jg .loop_y
+    RET
+
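+; horizontal pass of the 5x5 box sums: per row, 5-tap sliding sums of the
+; pixels (sum) and of their squares (sumsq), 16 pixels per iteration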
+INIT_YMM avx2
+cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        edged, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    test       edgeb, 2                             ; have_right
+    jz .no_right
+    xor        xlimd, xlimd
+    add           wd, 2+15
+    and           wd, ~15
+    jmp .right_done
+.no_right:
+    mov        xlimd, 3
+    sub           wd, 1
+.right_done:
+    lea          r10, [pb_right_ext_mask+32]
+    pxor          m1, m1
+    lea         srcq, [srcq+wq+1]
+    lea         sumq, [sumq+wq*2-2]
+    lea       sumsqq, [sumsqq+wq*4-4]
+    neg           wq
+.loop_y:
+    mov           xq, wq
+
+    ; load left
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    vpbroadcastd xm2, [leftq]
+    movd         xm0, [srcq+xq-1]
+    add        leftq, 4
+    palignr      xm0, xm2, 1
+    jmp .expand_x
+.no_left:
+    vpbroadcastb xm0, [srcq+xq-1]
+    jmp .expand_x
+.load_left_from_main:
+    vpbroadcastd xm0, [srcq+xq-4]
+.expand_x:
+    punpckhbw    xm0, xm1
+
+    ; when we reach this, xm0 contains the left two pixels in its highest words
+    cmp           xd, -16
+    jle .loop_x
+    test          xd, xd
+    jge .right_extend
+.partial_load_and_extend:
+    vpbroadcastb  m3, [srcq-1]
+    pmovzxbw      m2, [srcq+xq]
+    movu          m4, [r10+xq*2]
+    punpcklbw     m3, m1
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq       xm2, xm0, 14
+    vpbroadcastw  m2, xm2
+    jmp .loop_x_noload
+
+.loop_x:
+    pmovzxbw      m2, [srcq+xq]
+.loop_x_noload:
+    vinserti128   m0, xm2, 1
+    palignr       m3, m2, m0, 8
+    palignr       m4, m2, m0, 10
+    palignr       m5, m2, m0, 12
+    palignr       m6, m2, m0, 14
+
+    paddw         m0, m3, m2
+    punpcklwd     m7, m3, m2
+    punpckhwd     m3, m2
+    paddw         m0, m4
+    punpcklwd     m8, m4, m5
+    punpckhwd     m4, m5
+    paddw         m0, m5
+    punpcklwd     m9, m6, m1
+    punpckhwd     m5, m6, m1
+    paddw         m0, m6
+    pmaddwd       m7, m7
+    pmaddwd       m3, m3
+    pmaddwd       m8, m8
+    pmaddwd       m4, m4
+    pmaddwd       m9, m9
+    pmaddwd       m5, m5
+    paddd         m7, m8
+    paddd         m3, m4
+    paddd         m7, m9
+    paddd         m3, m5
+    movu [sumq+xq*2], m0
+    movu         [sumsqq+xq*4+ 0], xm7
+    movu         [sumsqq+xq*4+16], xm3
+    vextracti128 [sumsqq+xq*4+32], m7, 1
+    vextracti128 [sumsqq+xq*4+48], m3, 1
+
+    vextracti128 xm0, m2, 1
+    add           xq, 16
+
+    ; if x <= -16 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -16
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add         srcq, strideq
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    dec hd
+    jg .loop_y
+    RET
+
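+; vertical pass of the 5x5 box sums: accumulate five rows in place,
+; advancing two rows per iteration since the 5x5 pass only needs the
+; surface on every second row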
+INIT_YMM avx2
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
+    mov           xq, -2
+    rorx       ylimd, edged, 2
+    and        ylimd, 2                             ; have_bottom
+    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
+.loop_x:
+    lea           yd, [hq+ylimq+2]
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu         m10, [sum_ptrq+(384+16)*2*1]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    mova          m6, m0
+    mova          m7, m1
+    mova         m11, m10
+    mova         m12, m10
+    mova         m13, m10
+    jmp .loop_y_second_load
+.load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]   ; l3/4sq [right]
+    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
+    movu          m5, [sumsq_ptrq-(384+16)*4*0+32]   ; l2sq [right]
+    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
+    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova          m2, m0
+    mova          m3, m1
+    mova         m11, m10
+.loop_y:
+    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
+    movu          m7, [sumsq_ptrq+(384+16)*4*1+32]   ; l1sq [right]
+    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
+.loop_y_second_load:
+    test          yd, yd
+    jle .emulate_second_load
+    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
+    movu          m9, [sumsq_ptrq+(384+16)*4*2+32]   ; l0sq [right]
+    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw        m10, m11
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw        m10, m12
+    paddd         m0, m6
+    paddd         m1, m7
+    paddw        m10, m13
+    paddd         m0, m8
+    paddd         m1, m9
+    paddw        m10, m14
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+32], m1
+    movu  [sum_ptrq], m10
+
+    ; shift position down by one
+    mova          m0, m4
+    mova          m1, m5
+    mova          m2, m6
+    mova          m3, m7
+    mova          m4, m8
+    mova          m5, m9
+    mova         m10, m12
+    mova         m11, m13
+    mova         m12, m14
+    add   sumsq_ptrq, (384+16)*4*2
+    add     sum_ptrq, (384+16)*2*2
+    sub           yd, 2
+    jge .loop_y
+    ; l1 = l0
+    mova          m6, m8
+    mova          m7, m9
+    mova         m13, m14
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 16
+    cmp           xd, wd
+    jl .loop_x
+    RET
+.emulate_second_load:
+    mova          m8, m6
+    mova          m9, m7
+    mova         m14, m13
+    jmp .loop_y_noload
+
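+; convert the 5x5 box sums into the sgr surface: p = 25*sumsq - sum^2,
+; z = (p*s + (1 << 19)) >> 20 (the paddusw with 0xf0080029 both rounds and
+; saturates, and its low word 41 doubles as the one_by_x multiplier), then
+; x = sgr_x_by_x[z], b gets 256-x and a gets roughly (x*b*41 + 512) >> 10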
+INIT_YMM avx2
+cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+    lea           r5, [sgr_x_by_x-0xf03]
+%ifidn sd, sm
+    movd         xm6, sd
+    vpbroadcastd  m6, xm6
+%else
+    vpbroadcastd  m6, sm
+%endif
+    vpbroadcastd  m8, [pd_0xf0080029]
+    vpbroadcastd  m9, [pw_256]
+    pcmpeqb       m7, m7
+    psrld        m10, m9, 15                        ; pd_512
+    DEFINE_ARGS a, b, w, h, x
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    pmovzxwd      m0, [bq+xq*2+ 0]
+    pmovzxwd      m1, [bq+xq*2+16]
+    movu          m2, [aq+xq*4+ 0]
+    movu          m3, [aq+xq*4+32]
+    pslld         m4, m2, 3                         ; aa * 8
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    paddd         m4, m4                            ; aa * 16
+    paddd         m5, m5
+    paddd         m2, m4                            ; aa * 25
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    psubd         m2, m4                            ; p = aa * 25 - bb * bb
+    psubd         m3, m5
+    pmulld        m2, m6
+    pmulld        m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    mova          m5, m7
+    vpgatherdd    m4, [r5+m2], m5                   ; xx
+    mova          m5, m7
+    vpgatherdd    m2, [r5+m3], m5
+    psrld         m4, 24
+    psrld         m2, 24
+    packssdw      m3, m4, m2
+    pmullw        m4, m8
+    pmullw        m2, m8
+    psubw         m3, m9, m3
+    vpermq        m3, m3, q3120
+    pmaddwd       m0, m4
+    pmaddwd       m1, m2
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 10
+    psrld         m1, 10
+    movu   [bq+xq*2], m3
+    movu [aq+xq*4+ 0], m0
+    movu [aq+xq*4+32], m1
+    add           xd, 16
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
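+; combine the ab surface with the source for the 5x5 pass; the surface only
+; exists on every second row, so two output rows are produced per iteration,
+; with (left+right, centre) weighted 5:6 horizontally via pw_5_6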
+INIT_YMM avx2
+cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
+                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    vpbroadcastd  m9, [pw_5_6]
+    vpbroadcastd m12, [pw_256]
+    psrlw        m11, m12, 1                    ; pw_128
+    psrlw        m10, m12, 8                    ; pw_1
+    xor           xd, xd
+.loop_x:
+    lea     tmp_ptrq, [tq+xq*2]
+    lea     src_ptrq, [srcq+xq*1]
+    lea       a_ptrq, [aq+xq*4+(384+16)*4]
+    lea       b_ptrq, [bq+xq*2+(384+16)*2]
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    mova          m1, [aq+xq*4-(384+16)*4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    movu          m3, [aq+xq*4-(384+16)*4-4+32]
+    mova          m4, [aq+xq*4-(384+16)*4+32]
+    movu          m5, [aq+xq*4-(384+16)*4+4+32]
+    paddd         m0, m2
+    paddd         m3, m5
+    paddd         m0, m1
+    paddd         m3, m4
+    pslld         m2, m0, 2
+    pslld         m5, m3, 2
+    paddd         m2, m0
+    paddd         m5, m3
+    paddd         m0, m2, m1                    ; prev_odd_b [first half]
+    paddd         m1, m5, m4                    ; prev_odd_b [second half]
+    movu          m3, [bq+xq*2-(384+16)*2-2]
+    mova          m4, [bq+xq*2-(384+16)*2]
+    movu          m5, [bq+xq*2-(384+16)*2+2]
+    paddw         m3, m5
+    punpcklwd     m5, m3, m4
+    punpckhwd     m3, m4
+    pmaddwd       m5, m9
+    pmaddwd       m3, m9
+    packssdw      m2, m5, m3                    ; prev_odd_a
+    mov           yd, hd
+.loop_y:
+    movu          m3, [a_ptrq-4]
+    mova          m4, [a_ptrq]
+    movu          m5, [a_ptrq+4]
+    movu          m6, [a_ptrq+32-4]
+    mova          m7, [a_ptrq+32]
+    movu          m8, [a_ptrq+32+4]
+    paddd         m3, m5
+    paddd         m6, m8
+    paddd         m3, m4
+    paddd         m6, m7
+    pslld         m5, m3, 2
+    pslld         m8, m6, 2
+    paddd         m5, m3
+    paddd         m8, m6
+    paddd         m3, m5, m4                    ; cur_odd_b [first half]
+    paddd         m4, m8, m7                    ; cur_odd_b [second half]
+    movu          m5, [b_ptrq-2]
+    mova          m6, [b_ptrq]
+    movu          m7, [b_ptrq+2]
+    paddw         m5, m7
+    punpcklwd     m7, m5, m6
+    punpckhwd     m5, m6
+    pmaddwd       m7, m9
+    pmaddwd       m5, m9
+    packssdw      m5, m7, m5                    ; cur_odd_a
+
+    paddd         m0, m3                        ; cur_even_b [first half]
+    paddd         m1, m4                        ; cur_even_b [second half]
+    paddw         m2, m5                        ; cur_even_a
+
+    pmovzxbw      m6, [src_ptrq]
+    vperm2i128    m8, m0, m1, 0x31
+    vinserti128   m0, xm1, 1
+    punpcklwd     m7, m6, m10
+    punpckhwd     m6, m10
+    punpcklwd     m1, m2, m12
+    punpckhwd     m2, m12
+    pmaddwd       m7, m1
+    pmaddwd       m6, m2
+    paddd         m7, m0
+    paddd         m6, m8
+    psrad         m7, 9
+    psrad         m6, 9
+
+    pmovzxbw      m8, [src_ptrq+strideq]
+    punpcklwd     m0, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd     m1, m5, m11
+    punpckhwd     m2, m5, m11
+    pmaddwd       m0, m1
+    pmaddwd       m8, m2
+    vinserti128   m2, m3, xm4, 1
+    vperm2i128    m1, m3, m4, 0x31
+    paddd         m0, m2
+    paddd         m8, m1
+    psrad         m0, 8
+    psrad         m8, 8
+
+    packssdw      m7, m6
+    packssdw      m0, m8
+    mova [tmp_ptrq+384*2*0], m7
+    mova [tmp_ptrq+384*2*1], m0
+
+    mova          m0, m3
+    mova          m1, m4
+    mova          m2, m5
+    add       a_ptrq, (384+16)*4*2
+    add       b_ptrq, (384+16)*2*2
+    add     tmp_ptrq, 384*2*2
+    lea     src_ptrq, [src_ptrq+strideq*2]
+    sub           yd, 2
+    jg .loop_y
+    add           xd, 16
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
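+; blend two filtered planes (5x5 in t1, 3x3 in t2) into dst; wt packs both
+; weights into one dword, so roughly dst[x] += (wt0 * (t1[x] - dst[x]*16) +
+; wt1 * (t2[x] - dst[x]*16) + (1 << 10)) >> 11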
+INIT_YMM avx2
+cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+    movifnidn     wd, wm
+    movifnidn     hd, hm
+    vpbroadcastd  m0, wtm
+    vpbroadcastd m10, [pd_1024]
+    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m1, [t1q+idxq*2+ 0]
+    mova          m2, [t1q+idxq*2+32]
+    mova          m3, [t2q+idxq*2+ 0]
+    mova          m4, [t2q+idxq*2+32]
+    pmovzxbw      m5, [dstq+idxq+ 0]
+    pmovzxbw      m6, [dstq+idxq+16]
+    psllw         m7, m5, 4
+    psllw         m8, m6, 4
+    psubw         m1, m7
+    psubw         m2, m8
+    psubw         m3, m7
+    psubw         m4, m8
+    punpcklwd     m9, m1, m3
+    punpckhwd     m1, m3
+    punpcklwd     m3, m2, m4
+    punpckhwd     m2, m4
+    pmaddwd       m9, m0
+    pmaddwd       m1, m0
+    pmaddwd       m3, m0
+    pmaddwd       m2, m0
+    paddd         m9, m10
+    paddd         m1, m10
+    paddd         m3, m10
+    paddd         m2, m10
+    psrad         m9, 11
+    psrad         m1, 11
+    psrad         m3, 11
+    psrad         m2, 11
+    packssdw      m1, m9, m1
+    packssdw      m2, m3, m2
+    paddw         m1, m5
+    paddw         m2, m6
+    packuswb      m1, m2
+    vpermq        m1, m1, q3120
+    mova [dstq+idxq], m1
+    add         idxd, 32
+    cmp         idxd, wd
+    jl .loop_x
+    add         dstq, strideq
+    add          t1q, 384 * 2
+    add          t2q, 384 * 2
+    dec           hd
+    jg .loop_y
+    RET
+%endif ; ARCH_X86_64
diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
new file mode 100644 (file)
index 0000000..b0201ce
--- /dev/null
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+#include "src/tables.h"
+
+// Future potential optimizations:
+// - special chroma versions which don't filter [0]/[6];
+// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
+//   to bottom) instead of scanline-ordered should be faster since then the
+//   if (have_left) and similar conditions run only once instead of per line;
+// - filter_v_avx2 currently runs 16 pixels per iteration; it should be possible
+//   to run 32 (like filter_h_avx2), and then all vpermqs can go;
+// - maybe split out the top/bottom filter_h_avx2 calls from the main-body one,
+//   since then the have_left condition can be inlined;
+// - consider having the wrapper (wiener_filter_${ext}) also in hand-written
+//   assembly, so the setup overhead is minimized.
+
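+// The wrapper below runs the horizontal pass into a 68-row, 384-wide int16_t
+// scratch buffer: rows 2..h+1 hold the unit (up to 64 rows), and with
+// LR_HAVE_TOP/LR_HAVE_BOTTOM two extra rows on each side are filled in
+// from the lpf buffer before the vertical pass reads everything back.
+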
+#define WIENER_FILTER(ext) \
+\
+void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \
+                                 const pixel *src, ptrdiff_t stride, \
+                                 const int16_t fh[7], const intptr_t w, \
+                                 int h, enum LrEdgeFlags edges); \
+void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \
+                                 const int16_t *mid, int w, int h, \
+                                 const int16_t fv[7], enum LrEdgeFlags edges); \
+\
+static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+                                const pixel (*const left)[4], \
+                                const pixel *lpf, const ptrdiff_t lpf_stride, \
+                                const int w, const int h, const int16_t fh[7], \
+                                const int16_t fv[7], const enum LrEdgeFlags edges) \
+{ \
+    ALIGN_STK_32(int16_t, mid, 68 * 384,); \
+\
+    /* horizontal filter */ \
+    dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \
+                               fh, w, h, edges); \
+    if (edges & LR_HAVE_TOP) \
+        dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \
+                                   fh, w, 2, edges); \
+    if (edges & LR_HAVE_BOTTOM) \
+        dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \
+                                   lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \
+                                   fh, w, 2, edges); \
+\
+    dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, fv, edges); \
+}
+
+#define SGR_FILTER(ext) \
+\
+void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
+                            const pixel (*left)[4], \
+                            const pixel *src, const ptrdiff_t stride, \
+                            const int w, const int h, \
+                            const enum LrEdgeFlags edges); \
+void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
+                            const int w, const int h, \
+                            const enum LrEdgeFlags edges); \
+void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
+                              const int w, const int h, const int strength); \
+void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
+                                    const pixel *src, const ptrdiff_t stride, \
+                                    const int32_t *a, const int16_t *b, \
+                                    const int w, const int h); \
+\
+/* filter with a 3x3 box (radius=1) */ \
+static void dav1d_sgr_filter1_##ext(coef *tmp, \
+                                    const pixel *src, const ptrdiff_t stride, \
+                                    const pixel (*left)[4], \
+                                    const pixel *lpf, const ptrdiff_t lpf_stride, \
+                                    const int w, const int h, const int strength, \
+                                    const enum LrEdgeFlags edges) \
+{ \
+    ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+    ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+\
+    dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
+    if (edges & LR_HAVE_TOP) \
+        dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+                              NULL, lpf, lpf_stride, w, 2, edges); \
+\
+    if (edges & LR_HAVE_BOTTOM) \
+        dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+                              NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+                              lpf_stride, w, 2, edges); \
+\
+    dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
+    dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
+    dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
+} \
+\
+void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
+                            const pixel (*left)[4], \
+                            const pixel *src, const ptrdiff_t stride, \
+                            const int w, const int h, \
+                            const enum LrEdgeFlags edges); \
+void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
+                            const int w, const int h, \
+                            const enum LrEdgeFlags edges); \
+void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
+                              const int w, const int h, const int strength); \
+void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
+                                    const pixel *src, const ptrdiff_t stride, \
+                                    const int32_t *a, const int16_t *b, \
+                                    const int w, const int h); \
+\
+/* filter with a 5x5 box (radius=2) */ \
+static void dav1d_sgr_filter2_##ext(coef *tmp, \
+                                    const pixel *src, const ptrdiff_t stride, \
+                                    const pixel (*left)[4], \
+                                    const pixel *lpf, const ptrdiff_t lpf_stride, \
+                                    const int w, const int h, const int strength, \
+                                    const enum LrEdgeFlags edges) \
+{ \
+    ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+    ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+\
+    dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
+    if (edges & LR_HAVE_TOP) \
+        dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+                              NULL, lpf, lpf_stride, w, 2, edges); \
+\
+    if (edges & LR_HAVE_BOTTOM) \
+        dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+                              NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+                              lpf_stride, w, 2, edges); \
+\
+    dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
+    dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
+    dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
+} \
+\
+void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
+                               const coef *t1, const int w, const int h, \
+                               const int wt); \
+void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
+                               const coef *t1, const coef *t2, \
+                               const int w, const int h, \
+                               const uint32_t wt); \
+\
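+/* dav1d_sgr_params[sgr_idx][0] == 0 selects the 3x3-only pass and \
+   [1] == 0 the 5x5-only pass; otherwise both passes run and sgr_weighted2 \
+   mixes them, with 128 - sgr_wt[0] - sgr_wt[1] as the second weight. */ \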
+static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+                             const pixel (*const left)[4], \
+                             const pixel *lpf, const ptrdiff_t lpf_stride, \
+                             const int w, const int h, const int sgr_idx, \
+                             const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
+{ \
+    if (!dav1d_sgr_params[sgr_idx][0]) { \
+        ALIGN_STK_32(coef, tmp, 64 * 384,); \
+        dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges); \
+        dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
+    } else if (!dav1d_sgr_params[sgr_idx][1]) { \
+        ALIGN_STK_32(coef, tmp, 64 * 384,); \
+        dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges); \
+        dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
+    } else { \
+        ALIGN_STK_32(coef, tmp1, 64 * 384,); \
+        ALIGN_STK_32(coef, tmp2, 64 * 384,); \
+        dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges); \
+        dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges); \
+        const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
+        dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
+    } \
+}
+
+#define DEF_LR_FILTERS(ext) \
+WIENER_FILTER(ext) \
+SGR_FILTER(ext)
+
+#if BITDEPTH == 8
+WIENER_FILTER(sse2)
+DEF_LR_FILTERS(ssse3)
+# if ARCH_X86_64
+DEF_LR_FILTERS(avx2)
+# endif
+#endif
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+    c->wiener = wiener_filter_sse2;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+#if BITDEPTH == 8
+    c->wiener = wiener_filter_ssse3;
+    c->selfguided = sgr_filter_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->wiener = wiener_filter_avx2;
+    c->selfguided = sgr_filter_avx2;
+#endif
+}
diff --git a/src/x86/looprestoration_ssse3.asm b/src/x86/looprestoration_ssse3.asm
new file mode 100644 (file)
index 0000000..aaaea78
--- /dev/null
@@ -0,0 +1,1952 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_right_ext_mask: times 16 db 0xff
+                   times 16 db 0
+pb_14x0_1_2: times 14 db 0
+             db 1, 2
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+pb_0: times 16 db 0
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_15: times 16 db 15
+pb_0_1: times 8 db 0, 1
+pb_6_7: times 8 db 6, 7
+pb_14_15: times 8 db 14, 15
+pw_1: times 8 dw 1
+pw_16: times 8 dw 16
+pw_128: times 8 dw 128
+pw_255: times 8 dw 255
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_16380: times 8 dw 16380
+pw_5_6: times 4 dw 5, 6
+pw_0_128: times 4 dw 0, 128
+pd_1024: times 4 dd 1024
+%if ARCH_X86_32
+pd_256: times 4 dd 256
+pd_512: times 4 dd 512
+pd_2048: times 4 dd 2048
+%endif
+pd_0xF0080029: times 4 dd 0xF0080029
+pd_0xF00801C7: times 4 dd 0xF00801C7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
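+; x86-32 has no RIP-relative addressing, so data-section symbols are reached
+; through a base register anchored at $$; PIC_sym() rebases a symbol against
+; it, and XCHG_PIC_REG ping-pongs the base between two stack slots whenever
+; the register is temporarily needed for something else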
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+  %assign pic_reg_stk_off 4
+  %xdefine PIC_reg %1
+  %if %2 == 1
+    mov        [esp], %1
+  %endif
+    LEA      PIC_reg, PIC_base_offset
+  %if %3 == 1
+    XCHG_PIC_REG
+  %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+    mov [esp+pic_reg_stk_off], PIC_reg
+    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+    mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym)   (sym)
+%endif
+
+%macro PALIGNR 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+    palignr       %1, %2, %3, %4
+ %else
+  %assign %%i regnumof%+%1 + 1
+  %define %%tmp m %+ %%i
+    psrldq        %1, %3, %4
+    pslldq     %%tmp, %2, 16-%4
+    por           %1, %%tmp
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+    pmaddubsw     %1, %2
+ %else
+  %if %5 == 1
+    pxor          %3, %3
+  %endif
+    punpckhbw     %4, %1, %3
+    punpcklbw     %1, %3
+    pmaddwd       %4, %2
+    pmaddwd       %1, %2
+    packssdw      %1, %4
+ %endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;
+;;      wiener      ;;
+;;;;;;;;;;;;;;;;;;;;;;
+
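+; the horizontal pass convolves each row with the 7-tap fh filter into a
+; 384-wide int16_t mid buffer (biased by 2048 to keep the sums in range);
+; the vertical pass then applies fv, rounds by 1 << 10, and packs to bytes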
+%macro WIENER_H 0
+%if ARCH_X86_64
+cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
+    mov        edged, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+%else
+cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
+    mov           r5, edgem
+    mov     [esp+12], r5
+    mov           wd, wm
+    mov           hd, hm
+    SETUP_PIC hd
+ %define m15 m0
+ %define m14 m1
+ %define m13 m2
+ %define m12 m3
+%endif
+
+    movq         m15, [fhq]
+%if cpuflag(ssse3)
+    pshufb       m12, m15, [PIC_sym(pb_6_7)]
+    pshufb       m13, m15, [PIC_sym(pb_4)]
+    pshufb       m14, m15, [PIC_sym(pb_2)]
+    pshufb       m15, m15, [PIC_sym(pb_0)]
+%else
+    pshuflw      m12, m15, q3333
+    punpcklbw    m15, m15
+    pshufhw      m13, m15, q0000
+    pshuflw      m14, m15, q2222
+    pshuflw      m15, m15, q0000
+    punpcklqdq   m12, m12
+    punpckhqdq   m13, m13
+    punpcklqdq   m14, m14
+    punpcklqdq   m15, m15
+    psraw        m13, 8
+    psraw        m14, 8
+    psraw        m15, 8
+%endif
+
+%if ARCH_X86_64
+    mova         m11, [pw_2048]
+    mova         m10, [pw_16380]
+    lea          r11, [pb_right_ext_mask]
+
+    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+%else
+ %define m10    [PIC_sym(pw_16380)]
+ %define m11    [PIC_sym(pw_2048)]
+ %define m12    [esp+0x14]
+ %define m13    [esp+0x24]
+ %define m14    [esp+0x34]
+ %define m15    [esp+0x44]
+    mova         m12, m3
+    mova         m13, m2
+    mova         m14, m1
+    mova         m15, m0
+
+    DEFINE_ARGS dst, left, src, stride, x, w, h, edge
+ %define srcptrq    srcq
+ %define dstptrq    dstq
+ %define hd         dword [esp+ 0]
+ %define edgeb      byte  [esp+12]
+ %define xlimd      dword [esp+16]
+%endif
+
+    ; if (edge & has_right) align_w_to_16
+    ; else w -= 3, and use that as limit in x loop
+    test       edgeb, 2 ; has_right
+    jnz .align
+    mov        xlimd, -3
+    jmp .loop
+.align:
+    add           wd, 15
+    and           wd, ~15
+%if ARCH_X86_64
+    xor        xlimd, xlimd
+%else
+    mov        xlimd, 0
+%endif
+
+    ; main y loop for horizontal filter
+.loop:
+%if ARCH_X86_64
+    mov      srcptrq, srcq
+    mov      dstptrq, dstq
+    lea           xd, [wq+xlimq]
+%else
+    mov      [esp+8], srcq
+    mov      [esp+4], dstq
+    mov           xd, xlimd
+    add           xd, wd
+%endif
+
+    ; load left edge pixels
+    test       edgeb, 1 ; have_left
+    jz .emu_left
+    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
+    jz .load_left_combined
+    movd          m0, [leftq]
+    movd          m1, [srcq]
+    punpckldq     m0, m1
+    pslldq        m0, 9
+    add        leftq, 4
+    jmp .left_load_done
+.load_left_combined:
+    movq          m0, [srcq-3]
+    pslldq        m0, 10
+    jmp .left_load_done
+.emu_left:
+    movd          m0, [srcq]
+%if cpuflag(ssse3)
+    pshufb        m0, [PIC_sym(pb_14x0_1_2)]
+%else
+    pslldq        m1, m0, 13
+    punpcklbw     m0, m0
+    pshuflw       m0, m0, q0000
+    punpcklqdq    m0, m0
+    psrldq        m0, 2
+    por           m0, m1
+%endif
+
+    ; load right edge pixels
+.left_load_done:
+    cmp           xd, 16
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    je .splat_right
+
+    ; for very small images (w=[1-2]), edge-extend the previously loaded
+    ; pixels; ugly, but this only runs in very odd cases
+%if cpuflag(ssse3)
+    add           wd, wd
+ %if ARCH_X86_64
+    pshufb        m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+ %else
+    pshufb        m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
+ %endif
+    shr           wd, 1
+%else
+    shl           wd, 4
+    pcmpeqd       m2, m2
+    movd          m3, wd
+    psrldq        m2, 2
+    punpckhbw     m1, m0, m0
+    pshufhw       m1, m1, q1122
+    psllq         m1, m3
+    pand          m0, m2
+    pandn         m2, m1
+    por           m0, m2
+    shr           wd, 4
+%endif
+
+    ; main x loop; in the common case it starts at .main_load
+.splat_right:
+    ; no need to load new pixels, just extend them from the (possibly previously
+    ; extended) previous load into m0
+%if cpuflag(ssse3)
+    pshufb        m1, m0, [PIC_sym(pb_15)]
+%else
+    punpckhbw     m1, m0, m0
+    pshufhw       m1, m1, q3333
+    punpckhqdq    m1, m1
+%endif
+    jmp .main_loop
+.load_and_splat:
+    ; load new pixels and extend edge for right-most
+    movu          m1, [srcptrq+3]
+%if ARCH_X86_64
+    sub          r11, xq
+    movu          m2, [r11+16]
+    add          r11, xq
+%else
+    sub      PIC_reg, xd
+    movu          m2, [PIC_sym(pb_right_ext_mask)+16]
+    add      PIC_reg, xd
+%endif
+    movd          m3, [srcptrq+2+xq]
+%if cpuflag(ssse3)
+    pshufb        m3, [PIC_sym(pb_0)]
+%else
+    punpcklbw     m3, m3
+    pshuflw       m3, m3, q0000
+    punpcklqdq    m3, m3
+%endif
+    pand          m1, m2
+    pxor          m2, [PIC_sym(pb_right_ext_mask)]
+    pand          m3, m2
+    pxor          m2, [PIC_sym(pb_right_ext_mask)]
+    por           m1, m3
+    jmp .main_loop
+.main_load:
+    ; load subsequent line
+    movu          m1, [srcptrq+3]
+.main_loop:
+%if ARCH_X86_64
+    PALIGNR       m2, m1, m0, 10
+    PALIGNR       m3, m1, m0, 11
+    PALIGNR       m4, m1, m0, 12
+    PALIGNR       m5, m1, m0, 13
+    PALIGNR       m6, m1, m0, 14
+    PALIGNR       m7, m1, m0, 15
+
+    punpcklbw     m0, m2, m1
+    punpckhbw     m2, m1
+    punpcklbw     m8, m3, m7
+    punpckhbw     m3, m7
+    punpcklbw     m7, m4, m6
+    punpckhbw     m4, m6
+    PMADDUBSW     m0, m15, m6, m9, 1
+    PMADDUBSW     m2, m15, m6, m9, 0
+    PMADDUBSW     m8, m14, m6, m9, 0
+    PMADDUBSW     m3, m14, m6, m9, 0
+    PMADDUBSW     m7, m13, m6, m9, 0
+    PMADDUBSW     m4, m13, m6, m9, 0
+    paddw         m0, m8
+    paddw         m2, m3
+ %if cpuflag(ssse3)
+    pxor          m6, m6
+ %endif
+    punpcklbw     m3, m5, m6
+    punpckhbw     m5, m6
+    psllw         m8, m3, 7
+    psllw         m6, m5, 7
+    psubw         m8, m10
+    psubw         m6, m10
+    pmullw        m3, m12
+    pmullw        m5, m12
+    paddw         m0, m7
+    paddw         m2, m4
+    paddw         m0, m3
+    paddw         m2, m5
+    paddsw        m0, m8 ; see the avx2 version for an explanation
+    paddsw        m2, m6 ; of how the clipping works here
+    psraw         m0, 3
+    psraw         m2, 3
+    paddw         m0, m11
+    paddw         m2, m11
+    mova [dstptrq+ 0], m0
+    mova [dstptrq+16], m2
+%else
+    PALIGNR       m2, m1, m0, 10
+    punpcklbw     m3, m2, m1
+    punpckhbw     m2, m1
+    PMADDUBSW     m3, m15, m4, m5, 1
+    PMADDUBSW     m2, m15, m4, m5, 0
+    PALIGNR       m4, m1, m0, 11
+    PALIGNR       m5, m1, m0, 15
+    punpcklbw     m6, m4, m5
+    punpckhbw     m4, m5
+    PMADDUBSW     m6, m14, m5, m7, 1
+    PMADDUBSW     m4, m14, m5, m7, 0
+    paddw         m3, m6
+    paddw         m2, m4
+    PALIGNR       m4, m1, m0, 12
+    PALIGNR       m5, m1, m0, 14
+    punpcklbw     m6, m4, m5
+    punpckhbw     m4, m5
+    PMADDUBSW     m6, m13, m5, m7, 1
+    PMADDUBSW     m4, m13, m5, m7, 0
+    paddw         m3, m6
+    paddw         m2, m4
+    PALIGNR       m6, m1, m0, 13
+ %if cpuflag(ssse3)
+    pxor          m5, m5
+ %endif
+    punpcklbw     m4, m6, m5
+    punpckhbw     m6, m5
+    psllw         m5, m4, 7
+    psllw         m7, m6, 7
+    psubw         m5, m10
+    psubw         m7, m10
+    pmullw        m4, m12
+    pmullw        m6, m12
+    paddw         m3, m4
+    paddw         m2, m6
+    paddsw        m3, m5
+    paddsw        m2, m7
+    psraw         m3, 3
+    psraw         m2, 3
+    paddw         m3, m11
+    paddw         m2, m11
+    mova [dstptrq+ 0], m3
+    mova [dstptrq+16], m2
+%endif
+
+    mova          m0, m1
+    add      srcptrq, 16
+    add      dstptrq, 32
+    sub           xd, 16
+    cmp           xd, 16
+    jg .main_load
+    test          xd, xd
+    jg .load_and_splat
+    cmp           xd, xlimd
+    jg .splat_right
+
+%if ARCH_X86_32
+    mov         srcq, [esp+8]
+    mov         dstq, [esp+4]
+%endif
+    add         srcq, strideq
+    add         dstq, 384*2
+    dec           hd
+    jg .loop
+    RET
+%endmacro
+
+%macro WIENER_V 0
+%if ARCH_X86_64
+cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
+    mov        edged, edgem
+    movifnidn    fvq, fvmp
+    movifnidn     hd, hm
+    movq         m15, [fvq]
+    pshufd       m14, m15, q1111
+    pshufd       m15, m15, q0000
+    paddw        m14, [pw_0_128]
+    mova         m12, [pd_1024]
+
+    DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
+
+    mov        ylimd, edged
+    and        ylimd, 8 ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 3
+%else
+cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
+ %define ylimd [esp+12]
+
+    mov          r5d, edgem
+    and          r5d, 8
+    shr          r5d, 2
+    sub          r5d, 3
+    mov        ylimd, r5d
+    mov          fvq, fvmp
+    mov        edged, edgem
+
+    SETUP_PIC edged
+
+    movq          m0, [fvq]
+    pshufd        m1, m0, q1111
+    pshufd        m0, m0, q0000
+    paddw         m1, [PIC_sym(pw_0_128)]
+    mova  [esp+0x50], m0
+    mova  [esp+0x40], m1
+
+    DEFINE_ARGS dst, stride, mid, w, h, y, edge
+ %define mptrq      midq
+ %define dstptrq    dstq
+ %define edgeb      byte [esp]
+%endif
+
+    ; main x loop for vertical filter, does one column of 8 pixels
+.loop_x:
+    mova          m3, [midq] ; middle line
+
+    ; load top pixels
+    test       edgeb, 4 ; have_top
+    jz .emu_top
+    mova          m0, [midq-384*4]
+    mova          m2, [midq-384*2]
+    mova          m1, m0
+    jmp .load_bottom_pixels
+.emu_top:
+    mova          m0, m3
+    mova          m1, m3
+    mova          m2, m3
+
+    ; load bottom pixels
+.load_bottom_pixels:
+    mov           yd, hd
+%if ARCH_X86_64
+    mov        mptrq, midq
+    mov      dstptrq, dstq
+    add           yd, ylimd
+%else
+    mov      [esp+8], midq
+    mov      [esp+4], dstq
+    add           yd, ylimd
+%endif
+    jg .load_threelines
+
+    ; the remainder here is somewhat messy but only runs in very weird
+    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+    ; so performance is not terribly important here...
+    je .load_twolines
+    cmp           yd, -1
+    je .load_oneline
+    ; h == 1 case
+    mova          m5, m3
+    mova          m4, m3
+    mova          m6, m3
+    jmp .loop
+.load_oneline:
+    ; h == 2 case
+    mova          m4, [midq+384*2]
+    mova          m5, m4
+    mova          m6, m4
+    jmp .loop
+.load_twolines:
+    ; h == 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    mova          m6, m5
+    jmp .loop
+.load_threelines:
+    ; h > 3 case
+    mova          m4, [midq+384*2]
+    mova          m5, [midq+384*4]
+    ; third line loaded in main loop below
+
+    ; main y loop for vertical filter
+.loop_load:
+    ; load one line into m6. if that line is no longer available, skip the
+    ; load (jump straight to .loop), since m6 still has the data from the
+    ; previous line in it. We try to structure the loop so that the common
+    ; case is evaluated fastest
+    mova          m6, [mptrq+384*6]
+.loop:
+%if ARCH_X86_64
+    paddw         m7, m0, m6
+    paddw         m8, m1, m5
+    paddw         m9, m2, m4
+    punpcklwd    m10, m7, m8
+    punpckhwd     m7, m8
+    punpcklwd    m11, m9, m3
+    punpckhwd     m9, m3
+    pmaddwd      m10, m15
+    pmaddwd       m7, m15
+    pmaddwd      m11, m14
+    pmaddwd       m9, m14
+    paddd        m10, m12
+    paddd         m7, m12
+    paddd        m10, m11
+    paddd         m7, m9
+    psrad        m10, 11
+    psrad         m7, 11
+    packssdw     m10, m7
+    packuswb     m10, m10
+    movq   [dstptrq], m10
+%else
+    mova  [esp+0x30], m1
+    mova  [esp+0x20], m2
+    mova  [esp+0x10], m3
+    paddw         m0, m6
+    paddw         m1, m5
+    paddw         m2, m4
+    punpcklwd     m7, m2, m3
+    punpckhwd     m2, m3
+    punpcklwd     m3, m0, m1
+    punpckhwd     m0, m1
+    mova          m1, [esp+0x50]
+    pmaddwd       m3, m1
+    pmaddwd       m0, m1
+    mova          m1, [esp+0x40]
+    pmaddwd       m7, m1
+    pmaddwd       m2, m1
+    paddd         m3, [PIC_sym(pd_1024)]
+    paddd         m0, [PIC_sym(pd_1024)]
+    paddd         m3, m7
+    paddd         m0, m2
+    psrad         m3, 11
+    psrad         m0, 11
+    packssdw      m3, m0
+    packuswb      m3, m3
+    movq      [dstq], m3
+    mova          m1, [esp+0x30]
+    mova          m2, [esp+0x20]
+    mova          m3, [esp+0x10]
+%endif
+    ; shift pixels one position
+    mova          m0, m1
+    mova          m1, m2
+    mova          m2, m3
+    mova          m3, m4
+    mova          m4, m5
+    mova          m5, m6
+    add        mptrq, 384*2
+    add      dstptrq, strideq
+    dec           yd
+    jg .loop_load
+    ; for the bottom pixels, continue using m6 (as extended edge)
+    cmp           yd, ylimd
+    jg .loop
+
+%if ARCH_X86_32
+    mov         midq, [esp+8]
+    mov         dstq, [esp+4]
+%endif
+    add         midq, 16
+    add         dstq, 8
+    sub           wd, 8
+    jg .loop_x
+    RET
+%endmacro
+
+INIT_XMM sse2
+WIENER_H
+WIENER_V
+
+INIT_XMM ssse3
+WIENER_H
+WIENER_V
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;      self-guided     ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
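+; 32-bit multiply-low without SSE4 pmulld; assumes the multiplier in %2 has
+; the same 16-bit value replicated in both halves of each dword (clobbers m5)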
+%macro MULLD 2
+    pmulhuw       m5, %1, %2
+    pmullw        %1, %2
+    pslld         m5, 16
+    paddd         %1, m5
+%endmacro
+
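+; scalar emulation of vpgatherdd from the sgr_x_by_x table: pextrw pulls out
+; each index, pinsrw merges the loads (clobbers m5; r5 holds the table base
+; on x86-64)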
+%macro GATHERDD 2
+    mova          m5, m7
+    movd         r6d, %2
+ %if ARCH_X86_64
+    movd          %1, [r5+r6]
+    pextrw       r6d, %2, 2
+    pinsrw        m5, [r5+r6+2], 3
+    pextrw       r6d, %2, 4
+    pinsrw        %1, [r5+r6+2], 5
+    pextrw       r6d, %2, 6
+    pinsrw        m5, [r5+r6+2], 7
+ %else
+    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
+    pextrw       r6d, %2, 2
+    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
+    pextrw       r6d, %2, 4
+    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
+    pextrw       r6d, %2, 6
+    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
+ %endif
+    por           %1, m5
+%endmacro
+
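+; horizontal pass of the 3x3 box sums: per row, 3-tap sliding sums of the
+; pixels (sum) and of their squares (sumsq), 8 pixels per iteration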
+%if ARCH_X86_64
+cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+    mov        xlimd, edgem
+    movifnidn     xd, xm
+    mov           hd, hm
+    mov        edged, xlimd
+    and        xlimd, 2                             ; have_right
+    add           xd, xlimd
+    xor        xlimd, 2                             ; 2*!have_right
+%else
+cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq     r0m
+ %define xlimd  r1m
+ %define hd     hmp
+ %define edgeb  byte edgem
+
+    mov           r6, edgem
+    and           r6, 2                             ; have_right
+    add           xd, r6
+    xor           r6, 2                             ; 2*!have_right
+    mov        xlimd, r6
+    SETUP_PIC     r6, 0
+%endif
+
+    jnz .no_right
+    add           xd, 7
+    and           xd, ~7
+.no_right:
+    pxor          m1, m1
+    lea         srcq, [srcq+xq]
+    lea         sumq, [sumq+xq*2-2]
+    lea       sumsqq, [sumsqq+xq*4-4]
+    neg           xq
+    mov           wq, xq
+%if ARCH_X86_64
+    lea          r10, [pb_right_ext_mask+16]
+%endif
+.loop_y:
+    mov           xq, wq
+
+    ; load left
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    movd          m0, [leftq]
+    pslldq        m0, 12
+    add        leftq, 4
+    jmp .expand_x
+.no_left:
+    movd          m0, [srcq+xq]
+    pshufb        m0, [PIC_sym(pb_0)]
+    jmp .expand_x
+.load_left_from_main:
+    movd          m0, [srcq+xq-2]
+    pslldq        m0, 14
+.expand_x:
+    punpckhbw    xm0, xm1
+
+    ; when we reach this, m0 contains the left two pixels in its highest words
+    cmp           xd, -8
+    jle .loop_x
+.partial_load_and_extend:
+    movd          m3, [srcq-4]
+    pshufb        m3, [PIC_sym(pb_3)]
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+    punpcklbw     m3, m1
+%if ARCH_X86_64
+    movu          m4, [r10+xq*2]
+%else
+    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+%endif
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    pshufb        m2, m0, [PIC_sym(pb_14_15)]
+    jmp .loop_x_noload
+
+.loop_x:
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+.loop_x_noload:
+    palignr       m3, m2, m0, 12
+    palignr       m4, m2, m0, 14
+
+    punpcklwd     m5, m3, m2
+    punpckhwd     m6, m3, m2
+    paddw         m3, m4
+    punpcklwd     m7, m4, m1
+    punpckhwd     m4, m1
+    pmaddwd       m5, m5
+    pmaddwd       m6, m6
+    pmaddwd       m7, m7
+    pmaddwd       m4, m4
+    paddd         m5, m7
+    paddd         m6, m4
+    paddw         m3, m2
+    movu [sumq+xq*2], m3
+    movu [sumsqq+xq*4+ 0], m5
+    movu [sumsqq+xq*4+16], m6
+
+    mova          m0, m2
+    add           xq, 8
+
+    ; if x <= -8 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    add         srcq, strideq
+    dec           hd
+    jg .loop_y
+    RET
+
+%if ARCH_X86_64
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+    movifnidn  edged, edgem
+%else
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq   dword [esp+4]
+ %define ylimd       dword [esp+8]
+ %define m8          [esp+12]
+    mov        edged, r4m
+    mov           hd, r3m
+%endif
+    mov           xq, -2
+%if ARCH_X86_64
+    mov        ylimd, edged
+    and        ylimd, 8                             ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
+    mov  sumsq_baseq, sumsqq
+    mov    sum_baseq, sumq
+.loop_x:
+    mov       sumsqq, sumsq_baseq
+    mov         sumq, sum_baseq
+    lea           yd, [hq+ylimq+2]
+%else
+    mov           yd, edged
+    and           yd, 8                             ; have_bottom
+    shr           yd, 2
+    sub           yd, 2                             ; -2 if have_bottom=0, else 0
+    mov  sumsq_baseq, sumsqq
+    mov    sum_baseq, sumq
+    mov        ylimd, yd
+.loop_x:
+    mov       sumsqd, sumsq_baseq
+    mov         sumd, sum_baseq
+    lea           yd, [hq+2]
+    add           yd, ylimd
+%endif
+    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
+    lea         sumq, [sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsqq+(384+16)*4*1]
+    movu          m1, [sumsqq+(384+16)*4*1+16]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    movu          m6, [sumq+(384+16)*2*1]
+    mova          m7, m6
+    mova          m8, m6
+    jmp .loop_y_noload
+.load_top:
+    movu          m0, [sumsqq-(384+16)*4*1]      ; l2sq [left]
+    movu          m1, [sumsqq-(384+16)*4*1+16]   ; l2sq [right]
+    movu          m2, [sumsqq-(384+16)*4*0]      ; l1sq [left]
+    movu          m3, [sumsqq-(384+16)*4*0+16]   ; l1sq [right]
+    movu          m6, [sumq-(384+16)*2*1]        ; l2
+    movu          m7, [sumq-(384+16)*2*0]        ; l1
+.loop_y:
+%if ARCH_X86_64
+    movu          m8, [sumq+(384+16)*2*1]        ; l0
+%else
+    movu          m4, [sumq+(384+16)*2*1]        ; l0
+    mova          m8, m4
+%endif
+    movu          m4, [sumsqq+(384+16)*4*1]      ; l0sq [left]
+    movu          m5, [sumsqq+(384+16)*4*1+16]   ; l0sq [right]
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw         m6, m7
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw         m6, m8
+    movu [sumsqq+ 0], m0
+    movu [sumsqq+16], m1
+    movu      [sumq], m6
+
+    ; shift position down by one
+    mova          m0, m2
+    mova          m1, m3
+    mova          m2, m4
+    mova          m3, m5
+    mova          m6, m7
+    mova          m7, m8
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    dec           yd
+    jg .loop_y
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
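+; convert the 3x3 box sums into the sgr surface: p = 9*sumsq - sum^2,
+; z = (p*s + (1 << 19)) >> 20, x = sgr_x_by_x[z] (via GATHERDD), then
+; b gets 256-x and a gets roughly (x*b*455 + 2048) >> 12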
+cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+    movifnidn     sd, sm
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+%if ARCH_X86_64
+    LEA           r5, sgr_x_by_x-0xF03
+%else
+    SETUP_PIC r5, 0
+%endif
+    movd          m6, sd
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
+    pxor          m7, m7
+    DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+    mova          m8, [pd_0xF00801C7]
+    mova          m9, [pw_256]
+    psrld        m10, m9, 13                        ; pd_2048
+    mova         m11, [pb_unpcklwdw]
+%else
+ %define m8     [PIC_sym(pd_0xF00801C7)]
+ %define m9     [PIC_sym(pw_256)]
+ %define m10    [PIC_sym(pd_2048)]
+ %define m11    [PIC_sym(pb_unpcklwdw)]
+%endif
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    movq          m0, [bq+xq*2]
+    movq          m1, [bq+xq*2+(384+16)*2]
+    punpcklwd     m0, m7
+    punpcklwd     m1, m7
+    movu          m2, [aq+xq*4]
+    movu          m3, [aq+xq*4+(384+16)*4]
+    pslld         m4, m2, 3
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    pmaddwd       m0, m8
+    pmaddwd       m1, m8
+    psubd         m2, m4                            ; p = aa * 9 - bb * bb
+    psubd         m3, m5
+    MULLD         m2, m6
+    MULLD         m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    GATHERDD      m4, m2                            ; xx
+    GATHERDD      m2, m3
+    psrld         m4, 24
+    psrld         m2, 24
+    packssdw      m3, m4, m2
+    pshufb        m4, m11
+    MULLD         m0, m4
+    pshufb        m2, m11
+    MULLD         m1, m2
+    psubw         m5, m9, m3
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 12
+    psrld         m1, 12
+    movq   [bq+xq*2], m5
+    psrldq        m5, 8
+    movq [bq+xq*2+(384+16)*2], m5
+    movu   [aq+xq*4], m0
+    movu [aq+xq*4+(384+16)*4], m1
+    add           xd, 4
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
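+; combine the ab planes with the source for the 3x3 pass: both planes are
+; summed over each pixel's 3x3 neighbourhood (cross neighbours weighted 4,
+; diagonals 3), giving roughly t[x] = (b_sum*src[x] + a_sum + 256) >> 9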
+%if ARCH_X86_64
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+                                       tmp_base, src_base, a_base, b_base, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    mova         m15, [pw_16]
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    xor           xd, xd
+%else
+cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq  [esp+8]
+ %define src_baseq  [esp+12]
+ %define a_baseq    [esp+16]
+ %define b_baseq    [esp+20]
+ %define wd         [esp+24]
+ %define hd         [esp+28]
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    mov           wd, xd
+    mov           hd, yd
+    xor           xd, xd
+    SETUP_PIC yd, 1, 1
+    jmp .loop_start
+%endif
+
+.loop_x:
+    mov           tq, tmp_baseq
+    mov         srcq, src_baseq
+    mov           aq, a_baseq
+    mov           bq, b_baseq
+%if ARCH_X86_32
+.loop_start:
+    movu          m0, [bq+xq*2-(384+16)*2-2]
+    movu          m2, [bq+xq*2-(384+16)*2+2]
+    mova          m1, [bq+xq*2-(384+16)*2]          ; b:top
+    paddw         m0, m2                            ; b:tl+tr
+    movu          m2, [bq+xq*2-2]
+    movu          m3, [bq+xq*2+2]
+    paddw         m1, [bq+xq*2]                     ; b:top+ctr
+    paddw         m2, m3                            ; b:l+r
+    mova  [esp+0x80], m0
+    mova  [esp+0x70], m1
+    mova  [esp+0x60], m2
+%endif
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
+    paddd         m0, m2                            ; a:tl+tr [first half]
+    movu          m2, [aq+xq*4-(384+16)*4-4+16]
+    movu          m4, [aq+xq*4-(384+16)*4+4+16]
+    mova          m3, [aq+xq*4-(384+16)*4+16]       ; a:top [second half]
+    paddd         m2, m4                            ; a:tl+tr [second half]
+    movu          m4, [aq+xq*4-4]
+    movu          m5, [aq+xq*4+4]
+    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
+    paddd         m4, m5                            ; a:l+r [first half]
+    movu          m5, [aq+xq*4+16-4]
+    movu          m6, [aq+xq*4+16+4]
+    paddd         m3, [aq+xq*4+16]                  ; a:top+ctr [second half]
+    paddd         m5, m6                            ; a:l+r [second half]
+%if ARCH_X86_64
+    movu          m6, [bq+xq*2-(384+16)*2-2]
+    movu          m8, [bq+xq*2-(384+16)*2+2]
+    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
+    paddw         m6, m8                            ; b:tl+tr
+    movu          m8, [bq+xq*2-2]
+    movu          m9, [bq+xq*2+2]
+    paddw         m7, [bq+xq*2]                     ; b:top+ctr
+    paddw         m8, m9                            ; b:l+r
+%endif
+
+    lea           tq, [tq+xq*2]
+    lea         srcq, [srcq+xq*1]
+    lea           aq, [aq+xq*4+(384+16)*4]
+    lea           bq, [bq+xq*2+(384+16)*2]
+    mov           yd, hd
+.loop_y:
+%if ARCH_X86_64
+    movu          m9, [bq-2]
+    movu         m10, [bq+2]
+    paddw         m7, [bq]                          ; b:top+ctr+bottom
+    paddw         m9, m10                           ; b:bl+br
+    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
+    paddw         m6, m9                            ; b:tl+tr+bl+br
+    psubw         m7, [bq-(384+16)*2*2]             ; b:ctr+bottom
+    paddw        m10, m6
+    psllw        m10, 2
+    psubw        m10, m6                            ; aa
+    pxor         m14, m14
+    movq         m12, [srcq]
+    punpcklbw    m12, m14
+    punpcklwd     m6, m10, m15
+    punpckhwd    m10, m15
+    punpcklwd    m13, m12, m15
+    punpckhwd    m12, m15
+    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
+    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
+%else
+    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
+    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
+    mova  [esp+0x50], m1
+    mova  [esp+0x40], m3
+    mova  [esp+0x30], m4
+    movu          m6, [aq-4]
+    movu          m7, [aq+4]
+    paddd         m1, m4                            ; a:top+ctr+bottom+l+r [first half]
+    paddd         m3, m5                            ; a:top+ctr+bottom+l+r [second half]
+    paddd         m6, m7                            ; a:bl+br [first half]
+    movu          m7, [aq+16-4]
+    movu          m4, [aq+16+4]
+    paddd         m7, m4                            ; a:bl+br [second half]
+    paddd         m0, m6                            ; a:tl+tr+bl+br [first half]
+    paddd         m2, m7                            ; a:tl+tr+bl+br [second half]
+    paddd         m1, m0
+    paddd         m3, m2
+    pslld         m1, 2
+    pslld         m3, 2
+    psubd         m1, m0                            ; bb [first half]
+    psubd         m3, m2                            ; bb [second half]
+%endif
+
+%if ARCH_X86_64
+    movu         m11, [aq-4]
+    movu         m12, [aq+4]
+    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
+    paddd        m11, m12                           ; a:bl+br [first half]
+    movu         m12, [aq+16-4]
+    movu         m13, [aq+16+4]
+    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
+    paddd        m12, m13                           ; a:bl+br [second half]
+    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
+    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
+    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
+    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
+    paddd        m13, m0
+    paddd        m14, m2
+    pslld        m13, 2
+    pslld        m14, 2
+    psubd        m13, m0                            ; bb [first half]
+    psubd        m14, m2                            ; bb [second half]
+    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
+    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
+%else
+    mova          m4, [esp+0x80]
+    mova  [esp+0x80], m5
+    mova          m5, [esp+0x70]
+    mova  [esp+0x70], m6
+    mova          m6, [esp+0x60]
+    mova  [esp+0x60], m7
+    mova  [esp+0x20], m1
+    movu          m7, [bq-2]
+    movu          m1, [bq+2]
+    paddw         m5, [bq]                          ; b:top+ctr+bottom
+    paddw         m7, m1
+    paddw         m1, m5, m6                        ; b:top+ctr+bottom+l+r
+    paddw         m4, m7                            ; b:tl+tr+bl+br
+    psubw         m5, [bq-(384+16)*2*2]             ; b:ctr+bottom
+    paddw         m1, m4
+    psllw         m1, 2
+    psubw         m1, m4                            ; aa
+    movq          m0, [srcq]
+    XCHG_PIC_REG
+    punpcklbw     m0, [PIC_sym(pb_right_ext_mask)+16]
+    punpcklwd     m4, m1, [PIC_sym(pw_16)]
+    punpckhwd     m1, [PIC_sym(pw_16)]
+    punpcklwd     m2, m0, [PIC_sym(pw_16)]
+    punpckhwd     m0, [PIC_sym(pw_16)]
+    XCHG_PIC_REG
+    pmaddwd       m4, m2                            ; aa*src[x]+256 [first half]
+    pmaddwd       m1, m0                            ; aa*src[x]+256 [second half]
+%endif
+
+%if ARCH_X86_64
+    paddd         m6, m13
+    paddd        m10, m14
+    psrad         m6, 9
+    psrad        m10, 9
+    packssdw      m6, m10
+    mova        [tq], m6
+%else
+    paddd         m4, [esp+0x20]
+    paddd         m1, m3
+    psrad         m4, 9
+    psrad         m1, 9
+    packssdw      m4, m1
+    mova        [tq], m4
+%endif
+
+    ; shift to next row
+%if ARCH_X86_64
+    mova          m0, m4
+    mova          m2, m5
+    mova          m4, m11
+    mova          m5, m12
+    mova          m6, m8
+    mova          m8, m9
+%else
+    mova          m1, [esp+0x50]
+    mova          m3, [esp+0x40]
+    mova          m0, [esp+0x30]
+    mova          m2, [esp+0x80]
+    mova          m4, [esp+0x70]
+    mova  [esp+0x70], m5
+    mova          m5, [esp+0x60]
+    mova  [esp+0x80], m6
+    mova  [esp+0x60], m7
+    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
+    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
+%endif
+
+    add         srcq, strideq
+    add           aq, (384+16)*4
+    add           bq, (384+16)*2
+    add           tq, 384*2
+    dec           yd
+    jg .loop_y
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
+cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+    movifnidn     hd, hm
+%if ARCH_X86_32
+    SETUP_PIC r6, 0
+%endif
+    movd          m0, wtm
+    pshufb        m0, [PIC_sym(pb_0_1)]
+    psllw         m0, 4
+    pxor          m7, m7
+    DEFINE_ARGS dst, stride, t, w, h, idx
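+    ; blend the filtered plane t back into dst with the signed weight wt:
+    ;   dst[x] = clip_u8(dst[x] + ((t[x] - (dst[x] << 4)) * wt + (1 << 10)) >> 11)
+    ; pmulhrsw against wt << 4 folds the multiply, rounding and shift into one op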
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m1, [tq+idxq*2+ 0]
+    mova          m4, [tq+idxq*2+16]
+    mova          m5, [dstq+idxq]
+    punpcklbw     m2, m5, m7
+    punpckhbw     m5, m7
+    psllw         m3, m2, 4
+    psllw         m6, m5, 4
+    psubw         m1, m3
+    psubw         m4, m6
+    pmulhrsw      m1, m0
+    pmulhrsw      m4, m0
+    paddw         m1, m2
+    paddw         m4, m5
+    packuswb      m1, m4
+    mova [dstq+idxq], m1
+    add         idxd, 16
+    cmp         idxd, wd
+    jl .loop_x
+    add         dstq, strideq
+    add           tq, 384 * 2
+    dec           hd
+    jg .loop_y
+    RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        edged, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    mova         m10, [pb_0]
+    mova         m11, [pb_0_1]
+%else
+cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb      byte edgem
+ %define wd         xd
+ %define wq         wd
+ %define wm         r5m
+ %define strideq    r4m
+    SUB          esp, 8
+    SETUP_PIC sumsqd, 1, 1
+
+ %define m10    [PIC_sym(pb_0)]
+ %define m11    [PIC_sym(pb_0_1)]
+%endif
+
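+    ; with have_right, round w up to whole 16-pixel strips and read past the
+    ; block edge; otherwise stop early and synthesise the missing columns via
+    ; the xlim-driven right-edge extension below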
+    test       edgeb, 2                             ; have_right
+    jz .no_right
+    xor        xlimd, xlimd
+    add           wd, 2
+    add           wd, 15
+    and           wd, ~15
+    jmp .right_done
+.no_right:
+    mov        xlimd, 3
+    dec           wd
+.right_done:
+    pxor          m1, m1
+    lea         srcq, [srcq+wq+1]
+    lea         sumq, [sumq+wq*2-2]
+    lea       sumsqq, [sumsqq+wq*4-4]
+    neg           wq
+%if ARCH_X86_64
+    lea          r10, [pb_right_ext_mask+16]
+%else
+    mov           wm, xd
+ %define wq wm
+%endif
+
+.loop_y:
+    mov           xq, wq
+    ; load left
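+    ; (the two pixels left of the block come from the left[] array if present,
+    ; are replicated from the leftmost pixel when have_left=0, or are reloaded
+    ; from the main row when left is NULL)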
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    movd          m0, [leftq]
+    movd          m2, [srcq+xq-1]
+    pslldq        m2, 4
+    por           m0, m2
+    pslldq        m0, 11
+    add        leftq, 4
+    jmp .expand_x
+.no_left:
+    movd          m0, [srcq+xq-1]
+    XCHG_PIC_REG
+    pshufb        m0, m10
+    XCHG_PIC_REG
+    jmp .expand_x
+.load_left_from_main:
+    movd          m0, [srcq+xq-4]
+    pslldq        m0, 12
+.expand_x:
+    punpckhbw     m0, m1
+
+    ; at this point, m0 holds the two pixels left of the block in its highest words
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jge .right_extend
+.partial_load_and_extend:
+    XCHG_PIC_REG
+    movd          m3, [srcq-1]
+    movq          m2, [srcq+xq]
+    pshufb        m3, m10
+    punpcklbw     m3, m1
+    punpcklbw     m2, m1
+%if ARCH_X86_64
+    movu          m4, [r10+xq*2]
+%else
+    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+    XCHG_PIC_REG
+%endif
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq        m2, m0, 14
+    XCHG_PIC_REG
+    pshufb        m2, m11
+    XCHG_PIC_REG
+    jmp .loop_x_noload
+
+.loop_x:
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+.loop_x_noload:
+    palignr       m3, m2, m0, 8
+    palignr       m4, m2, m0, 10
+    palignr       m5, m2, m0, 12
+    palignr       m6, m2, m0, 14
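+    ; m0:m2 hold the previous and current 8 pixels as words; the palignrs
+    ; recreate the row at offsets -4..-1 so each lane sees all five taps of
+    ; the box sum, and pmaddwd of the interleaved pairs with themselves
+    ; accumulates the matching sums of squares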
+
+%if ARCH_X86_64
+    paddw         m0, m3, m2
+    punpcklwd     m7, m3, m2
+    punpckhwd     m3, m2
+    paddw         m0, m4
+    punpcklwd     m8, m4, m5
+    punpckhwd     m4, m5
+    paddw         m0, m5
+    punpcklwd     m9, m6, m1
+    punpckhwd     m5, m6, m1
+    paddw         m0, m6
+    pmaddwd       m7, m7
+    pmaddwd       m3, m3
+    pmaddwd       m8, m8
+    pmaddwd       m4, m4
+    pmaddwd       m9, m9
+    pmaddwd       m5, m5
+    paddd         m7, m8
+    paddd         m3, m4
+    paddd         m7, m9
+    paddd         m3, m5
+    movu [sumq+xq*2], m0
+    movu [sumsqq+xq*4+ 0], m7
+    movu [sumsqq+xq*4+16], m3
+%else
+    paddw         m0, m3, m2
+    paddw         m0, m4
+    paddw         m0, m5
+    paddw         m0, m6
+    movu [sumq+xq*2], m0
+    punpcklwd     m7, m3, m2
+    punpckhwd     m3, m2
+    punpcklwd     m0, m4, m5
+    punpckhwd     m4, m5
+    punpckhwd     m5, m6, m1
+    pmaddwd       m7, m7
+    pmaddwd       m3, m3
+    pmaddwd       m0, m0
+    pmaddwd       m4, m4
+    pmaddwd       m5, m5
+    paddd         m7, m0
+    paddd         m3, m4
+    paddd         m3, m5
+    punpcklwd     m0, m6, m1
+    pmaddwd       m0, m0
+    paddd         m7, m0
+    movu [sumsqq+xq*4+ 0], m7
+    movu [sumsqq+xq*4+16], m3
+%endif
+
+    mova          m0, m2
+    add           xq, 8
+
+    ; if x <= -8 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add         srcq, strideq
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    dec           hd
+    jg .loop_y
+%if ARCH_X86_32
+    ADD          esp, 8
+%endif
+    RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
+    mov        ylimd, edged
+%else
+cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm     [esp+0]
+ %define hm     [esp+4]
+ %define edgem  [esp+8]
+    mov           wm, xd
+    mov           hm, yd
+    mov        edgem, ylimd
+%endif
+
+    and        ylimd, 8                             ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
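+    ; sum five vertically adjacent rows per output row, advancing two source
+    ; rows per iteration (the 5x5 filter only needs coefficients on every
+    ; other row); rows missing at the top/bottom edges are replicated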
+    mov           xq, -2
+%if ARCH_X86_64
+.loop_x:
+    lea           yd, [hd+ylimd+2]
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    mova          m6, m0
+    mova          m7, m1
+    movu         m10, [sum_ptrq+(384+16)*2*1]
+    mova         m11, m10
+    mova         m12, m10
+    mova         m13, m10
+    jmp .loop_y_second_load
+.load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
+    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
+    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
+    mova          m2, m0
+    mova          m3, m1
+    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
+    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova         m11, m10
+.loop_y:
+    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
+    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
+    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
+.loop_y_second_load:
+    test          yd, yd
+    jle .emulate_second_load
+    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
+    movu          m9, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
+    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw        m10, m11
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw        m10, m12
+    paddd         m0, m6
+    paddd         m1, m7
+    paddw        m10, m13
+    paddd         m0, m8
+    paddd         m1, m9
+    paddw        m10, m14
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+16], m1
+    movu  [sum_ptrq], m10
+
+    ; shift position down by one
+    mova          m0, m4
+    mova          m1, m5
+    mova          m2, m6
+    mova          m3, m7
+    mova          m4, m8
+    mova          m5, m9
+    mova         m10, m12
+    mova         m11, m13
+    mova         m12, m14
+    add   sumsq_ptrq, (384+16)*4*2
+    add     sum_ptrq, (384+16)*2*2
+    sub           yd, 2
+    jge .loop_y
+    ; l1 = l0
+    mova          m6, m8
+    mova          m7, m9
+    mova         m13, m14
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+.emulate_second_load:
+    mova          m8, m6
+    mova          m9, m7
+    mova         m14, m13
+    jmp .loop_y_noload
+%else
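+; x86-32 lacks the registers to carry both planes at once, so it makes two
+; passes over the tile: first the 32-bit sumsq columns, then the 16-bit sums.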
+.sumsq_loop_x:
+    lea           yd, [ylimd+2]
+    add           yd, hm
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    test  byte edgem, 4                             ; have_top
+    jnz .sumsq_load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
+    mova          m4, m0
+    mova          m5, m1
+    mova          m6, m0
+    mova          m7, m1
+    mova  [esp+0x1c], m0
+    mova  [esp+0x0c], m1
+    jmp .sumsq_loop_y_second_load
+.sumsq_load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
+    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
+    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
+    mova  [esp+0x1c], m0
+    mova  [esp+0x0c], m1
+.sumsq_loop_y:
+    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
+    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
+.sumsq_loop_y_second_load:
+    test          yd, yd
+    jle .sumsq_emulate_second_load
+    movu          m2, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
+    movu          m3, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
+.sumsq_loop_y_noload:
+    paddd         m0, [esp+0x1c]
+    paddd         m1, [esp+0x0c]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m0, m6
+    paddd         m1, m7
+    paddd         m0, m2
+    paddd         m1, m3
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+16], m1
+
+    ; shift position down by one
+    mova          m0, m4
+    mova          m1, m5
+    mova          m4, m2
+    mova          m5, m3
+    mova  [esp+0x1c], m6
+    mova  [esp+0x0c], m7
+    add   sumsq_ptrq, (384+16)*4*2
+    sub           yd, 2
+    jge .sumsq_loop_y
+    ; l1 = l0
+    mova          m6, m2
+    mova          m7, m3
+    cmp           yd, ylimd
+    jg .sumsq_loop_y_noload
+    add           xd, 8
+    cmp           xd, wm
+    jl .sumsq_loop_x
+
+    mov           xd, -2
+.sum_loop_x:
+    lea           yd, [ylimd+2]
+    add           yd, hm
+    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test  byte edgem, 4                             ; have_top
+    jnz .sum_load_top
+    movu          m0, [sum_ptrq+(384+16)*2*1]
+    mova          m1, m0
+    mova          m2, m0
+    mova          m3, m0
+    jmp .sum_loop_y_second_load
+.sum_load_top:
+    movu          m0, [sum_ptrq-(384+16)*2*1]        ; l3/4
+    movu          m2, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova          m1, m0
+.sum_loop_y:
+    movu          m3, [sum_ptrq+(384+16)*2*1]        ; l1
+.sum_loop_y_second_load:
+    test          yd, yd
+    jle .sum_emulate_second_load
+    movu          m4, [sum_ptrq+(384+16)*2*2]        ; l0
+.sum_loop_y_noload:
+    paddw         m0, m1
+    paddw         m0, m2
+    paddw         m0, m3
+    paddw         m0, m4
+    movu  [sum_ptrq], m0
+
+    ; shift position down by one
+    mova          m0, m2
+    mova          m1, m3
+    mova          m2, m4
+    add     sum_ptrq, (384+16)*2*2
+    sub           yd, 2
+    jge .sum_loop_y
+    ; l1 = l0
+    mova          m3, m4
+    cmp           yd, ylimd
+    jg .sum_loop_y_noload
+    add           xd, 8
+    cmp           xd, wm
+    jl .sum_loop_x
+    RET
+.sumsq_emulate_second_load:
+    mova          m2, m6
+    mova          m3, m7
+    jmp .sumsq_loop_y_noload
+.sum_emulate_second_load:
+    mova          m4, m3
+    jmp .sum_loop_y_noload
+%endif
+
+cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+    movifnidn     sd, sm
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+%if ARCH_X86_64
+    LEA           r5, sgr_x_by_x-0xF03
+%else
+    SETUP_PIC r5, 0
+%endif
+    movd          m6, sd
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
+    pxor          m7, m7
+    DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+    mova          m8, [pd_0xF0080029]
+    mova          m9, [pw_256]
+    psrld        m10, m9, 15                        ; pd_512
+%else
+ %define m8     [PIC_sym(pd_0xF0080029)]
+ %define m9     [PIC_sym(pw_256)]
+ %define m10    [PIC_sym(pd_512)]
+%endif
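+    ; per coefficient pair (5x5 window, n = 25):
+    ;   p = 25 * sumsq - sum * sum
+    ;   z = (p * s + (1 << 19)) >> 20, rounded and clamped by the single
+    ;       unsigned-saturating add of the 0xF0080029 bias constant
+    ;   x = sgr_x_by_x[z], then the planes are rewritten in place as
+    ;   a = (41 * x * b + 512) >> 10 (x * b * one_by_x at reduced precision)
+    ;   b = 256 - x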
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    movq          m0, [bq+xq*2+0]
+    movq          m1, [bq+xq*2+8]
+    punpcklwd     m0, m7
+    punpcklwd     m1, m7
+    movu          m2, [aq+xq*4+ 0]
+    movu          m3, [aq+xq*4+16]
+    pslld         m4, m2, 3                         ; aa * 8
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    paddd         m4, m4                            ; aa * 16
+    paddd         m5, m5
+    paddd         m2, m4                            ; aa * 25
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    psubd         m2, m4                            ; p = aa * 25 - bb * bb
+    psubd         m3, m5
+    MULLD         m2, m6
+    MULLD         m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    GATHERDD      m4, m2                            ; xx
+    GATHERDD      m2, m3
+    psrld         m4, 24
+    psrld         m2, 24
+    packssdw      m3, m4, m2
+    pmullw        m4, m8
+    pmullw        m2, m8
+    psubw         m5, m9, m3
+    pmaddwd       m0, m4
+    pmaddwd       m1, m2
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 10
+    psrld         m1, 10
+    movu   [bq+xq*2], m5
+    movu [aq+xq*4+ 0], m0
+    movu [aq+xq*4+16], m1
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
+%if ARCH_X86_64
+cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+                                       tmp_base, src_base, a_base, b_base, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    mova          m9, [pw_5_6]
+    mova         m12, [pw_256]
+    psrlw        m10, m12, 8                    ; pw_1
+    psrlw        m11, m12, 1                    ; pw_128
+    pxor         m13, m13
+%else
+cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq  r0m
+ %define src_baseq  r1m
+ %define a_baseq    r3m
+ %define b_baseq    r4m
+ %define wd         r5m
+ %define hd         r6m
+
+    SUB          esp, 8
+    SETUP_PIC yd
+
+ %define m8     m5
+ %define m9     [PIC_sym(pw_5_6)]
+ %define m10    [PIC_sym(pw_1)]
+ %define m11    [PIC_sym(pw_128)]
+ %define m12    [PIC_sym(pw_256)]
+ %define m13    m0
+%endif
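+; 5x5 variant: a/b neighbourhood sums are only formed on every second row, as
+; 5*(left+right) + 6*centre (pw_5_6); even output rows mix the previous and
+; current odd-row sums with a >> 9, odd rows reuse the current sum with a >> 8.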
+    xor           xd, xd
+.loop_x:
+    mov           tq, tmp_baseq
+    mov         srcq, src_baseq
+    mov           aq, a_baseq
+    mov           bq, b_baseq
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    mova          m1, [aq+xq*4-(384+16)*4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    movu          m3, [aq+xq*4-(384+16)*4-4+16]
+    mova          m4, [aq+xq*4-(384+16)*4+16]
+    movu          m5, [aq+xq*4-(384+16)*4+4+16]
+    paddd         m0, m2
+    paddd         m3, m5
+    paddd         m0, m1
+    paddd         m3, m4
+    pslld         m2, m0, 2
+    pslld         m5, m3, 2
+    paddd         m2, m0
+    paddd         m5, m3
+    paddd         m0, m2, m1                    ; prev_odd_b [first half]
+    paddd         m1, m5, m4                    ; prev_odd_b [second half]
+    movu          m3, [bq+xq*2-(384+16)*2-2]
+    mova          m4, [bq+xq*2-(384+16)*2]
+    movu          m5, [bq+xq*2-(384+16)*2+2]
+    paddw         m3, m5
+    punpcklwd     m5, m3, m4
+    punpckhwd     m3, m4
+    pmaddwd       m5, m9
+    pmaddwd       m3, m9
+    mova          m2, m5
+    packssdw      m2, m3                        ; prev_odd_a
+    lea           tq, [tq+xq*2]
+    lea         srcq, [srcq+xq*1]
+    lea           aq, [aq+xq*4+(384+16)*4]
+    lea           bq, [bq+xq*2+(384+16)*2]
+%if ARCH_X86_32
+    mov        [esp], PIC_reg
+%endif
+    mov           yd, hd
+    XCHG_PIC_REG
+.loop_y:
+    movu          m3, [aq-4]
+    mova          m4, [aq]
+    movu          m5, [aq+4]
+    paddd         m3, m5
+    paddd         m3, m4
+    pslld         m5, m3, 2
+    paddd         m5, m3
+    paddd         m5, m4                        ; cur_odd_b [first half]
+    movu          m3, [aq+16-4]
+    mova          m6, [aq+16]
+    movu          m7, [aq+16+4]
+    paddd         m3, m7
+    paddd         m3, m6
+    pslld         m7, m3, 2
+    paddd         m7, m3
+    paddd         m4, m7, m6                    ; cur_odd_b [second half]
+    movu          m3, [bq-2]
+    mova          m6, [bq]
+    movu          m7, [bq+2]
+    paddw         m3, m7
+    punpcklwd     m7, m3, m6
+    punpckhwd     m3, m6
+    pmaddwd       m7, m9
+    pmaddwd       m3, m9
+    packssdw      m6, m7, m3                    ; cur_odd_a
+
+    paddd         m0, m5                        ; cur_even_b [first half]
+    paddd         m1, m4                        ; cur_even_b [second half]
+    paddw         m2, m6                        ; cur_even_a
+
+    movq          m3, [srcq]
+%if ARCH_X86_64
+    punpcklbw     m3, m13
+%else
+    mova        [td], m5
+    pxor          m7, m7
+    punpcklbw     m3, m7
+%endif
+    punpcklwd     m7, m3, m10
+    punpckhwd     m3, m10
+    punpcklwd     m8, m2, m12
+    punpckhwd     m2, m12
+    pmaddwd       m7, m8
+    pmaddwd       m3, m2
+    paddd         m7, m0
+    paddd         m3, m1
+    psrad         m7, 9
+    psrad         m3, 9
+
+%if ARCH_X86_32
+    pxor         m13, m13
+%endif
+    movq          m8, [srcq+strideq]
+    punpcklbw     m8, m13
+    punpcklwd     m0, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd     m1, m6, m11
+    punpckhwd     m2, m6, m11
+    pmaddwd       m0, m1
+    pmaddwd       m8, m2
+%if ARCH_X86_64
+    paddd         m0, m5
+%else
+    paddd         m0, [td]
+%endif
+    paddd         m8, m4
+    psrad         m0, 8
+    psrad         m8, 8
+
+    packssdw      m7, m3
+    packssdw      m0, m8
+%if ARCH_X86_32
+    mova          m5, [td]
+%endif
+    mova [tq+384*2*0], m7
+    mova [tq+384*2*1], m0
+
+    mova          m0, m5
+    mova          m1, m4
+    mova          m2, m6
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    add           tq, 384*2*2
+    lea         srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+    sub           yd, 2
+%else
+    sub dword [esp+4], 2
+%endif
+    jg .loop_y
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+%if ARCH_X86_32
+    ADD          esp, 8
+%endif
+    RET
+
+cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+    movifnidn     wd, wm
+    movd          m0, wtm
+%if ARCH_X86_64
+    movifnidn     hd, hm
+    mova         m10, [pd_1024]
+    pxor         m11, m11
+%else
+    SETUP_PIC     hd, 0
+ %define m10    [PIC_sym(pd_1024)]
+ %define m11    m7
+%endif
+    pshufd        m0, m0, 0
+    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+%if ARCH_X86_32
+ %define hd     hmp
+%endif
+
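+    ; blend both filtered planes with a packed pair of signed weights:
+    ;   u = dst[x] << 4
+    ;   dst[x] = clip_u8(dst[x] + (wt0*(t1[x]-u) + wt1*(t2[x]-u) + (1 << 10)) >> 11)
+    ; pmaddwd consumes the interleaved (t1, t2) difference pairs directly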
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m1, [t1q+idxq*2+ 0]
+    mova          m2, [t1q+idxq*2+16]
+    mova          m3, [t2q+idxq*2+ 0]
+    mova          m4, [t2q+idxq*2+16]
+    mova          m6, [dstq+idxq]
+%if ARCH_X86_32
+    pxor          m11, m11
+%endif
+    punpcklbw     m5, m6, m11
+    punpckhbw     m6, m11
+    psllw         m7, m5, 4
+    psubw         m1, m7
+    psubw         m3, m7
+    psllw         m7, m6, 4
+    psubw         m2, m7
+    psubw         m4, m7
+    punpcklwd     m7, m1, m3
+    punpckhwd     m1, m3
+    punpcklwd     m3, m2, m4
+    punpckhwd     m2, m4
+    pmaddwd       m7, m0
+    pmaddwd       m1, m0
+    pmaddwd       m3, m0
+    pmaddwd       m2, m0
+    paddd         m7, m10
+    paddd         m1, m10
+    paddd         m3, m10
+    paddd         m2, m10
+    psrad         m7, 11
+    psrad         m1, 11
+    psrad         m3, 11
+    psrad         m2, 11
+    packssdw      m7, m1
+    packssdw      m3, m2
+    paddw         m7, m5
+    paddw         m3, m6
+    packuswb      m7, m3
+    mova [dstq+idxq], m7
+    add         idxd, 16
+    cmp         idxd, wd
+    jl .loop_x
+    add         dstq, strideq
+    add          t1q, 384 * 2
+    add          t2q, 384 * 2
+    dec           hd
+    jg .loop_y
+    RET
diff --git a/src/x86/mc.asm b/src/x86/mc.asm
new file mode 100644
index 0000000..5d769df
--- /dev/null
+++ b/src/x86/mc.asm
@@ -0,0 +1,8066 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db  0,  0,  0,  0
+            ; 2
+            db 45, 19, 64,  0
+            ; 4
+            db 39, 25, 50, 14, 59,  5, 64,  0
+            ; 8
+            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
+            ; 16
+            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
+            ; 32
+            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
+            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
+            db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
+
+bidir_sctr_w4:  dd  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+wm_420_perm4:   db  1,  3,  9, 11,  5,  7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+                db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+                db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8:   db  1,  3, 17, 19,  5,  7, 21, 23,  9, 11, 25, 27, 13, 15, 29, 31
+                db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+                db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
+                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16:  db  1,  3, 33, 35,  5,  7, 37, 39,  9, 11, 41, 43, 13, 15, 45, 47
+                db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+                db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
+                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask:    db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+                db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+                db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask:    db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+                db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+                db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+                db  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+                db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+                db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+                db  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+                db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+                db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8:  db 16,  0, 17,  1, 18,  2, 19,  3, 20,  4, 21,  5, 22,  6, 23,  7
+                db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+                db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+                db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16,  0, 17,  1, 18,  2, 19,  3, 20,  4, 21,  5, 22,  6, 23,  7
+                db 24,  8, 25,  9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+                db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+                db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64,  0, 65,  1, 66,  2, 67,  3, 68,  4, 69,  5, 70,  6, 71,  7
+                db 72,  8, 73,  9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+                db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+                db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq  0,  4,  1,  5,  2,  6,  3,  7
+spel_h_perm16a: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+                db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+                db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+                db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+                db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+                db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+                db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+                db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+                db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+                db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+                db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+                db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+
+warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
+                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
+warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
+                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
+subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
+                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
+subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
+deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
+blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
+pb_8x0_8x8: times 8 db 0
+            times 8 db 8
+bdct_lb_dw: times 4 db 0
+            times 4 db 4
+            times 4 db 8
+            times 4 db 12
+
+ALIGN 32
+rescale_mul:    dd  0,  1,  2,  3, 4, 5, 6, 7
+resize_shuf:    times 5 db 0
+                db  1, 2, 3, 4, 5, 6
+                times 5+8 db 7
+
+ALIGN 8
+wm_420_perm64:  dq 0xfedcba9876543210
+wm_420_sign:    dd 0x01020102, 0x01010101
+wm_422_sign:    dd 0x80808080, 0x7f7f7f7f
+wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+ALIGN 4
+pb_0123: db 0, 1, 2, 3
+pb_4567: db 4, 5, 6, 7
+pw_m128: times 2 dw -128
+pw_m256: times 2 dw -256
+pw_32:   times 2 dw 32
+pw_34:   times 2 dw 34
+pw_258:  times 2 dw 258
+pw_512:  times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_2:            dd 2
+pd_32:           dd 32
+pd_63:           dd 63
+pd_512:          dd 512
+pd_32768:        dd 32768
+pd_0x3ff:        dd 0x3ff
+pd_0x4000:       dd 0x4000
+pq_0x40000000:   dq 0x40000000
+
+%define pb_m64 (wm_sign_avx512+4)
+%define pb_64  (wm_sign_avx512+8)
+%define pb_127 (wm_422_sign   +4)
+
+cextern mc_subpel_filters
+cextern mc_warp_filter
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
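+; The *_JMP_TABLE macros emit per-width tables of offsets (16-bit words, or
+; 32-bit dwords for BIDIR) relative to a base label; entry points index them
+; with tzcnt(w) and dispatch with a single load, add and jmp. In HV_JMP_TABLE,
+; the types mask selects which of the h/v/hv tables to emit.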
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table  (%%h  - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table  (%%v  - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 1-*
+    %xdefine %1_table (%%table - 2*%2)
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    %%table:
+    %rep %0 - 1
+        dd %%prefix %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SCALED_JMP_TABLE 1-*
+    %xdefine %1_table (%%table - %2)
+    %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+    %rep %0 - 1
+        dw %%base %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_1024:
+    %xdefine %1_dy1_table (%%dy_1024 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy1_w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1
+%%dy_2048:
+    %xdefine %1_dy2_table (%%dy_2048 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy2_w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE   put,  avx2,           2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE   prep, avx2,              4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2,   4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  avg_avx2,                4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_avg_avx2,              4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  mask_avx2,               4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_420_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_422_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_444_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  blend_avx2,              4, 8, 16, 32
+BIDIR_JMP_TABLE  blend_v_avx2,         2, 4, 8, 16, 32
+BIDIR_JMP_TABLE  blend_h_avx2,         2, 4, 8, 16, 32, 32, 32
+
+%if HAVE_AVX512ICL
+BASE_JMP_TABLE prep, avx512icl,            4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap,  avx512icl, 7,    4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx512icl,             4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx512icl,           4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx512icl,            4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx512icl,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx512icl,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444_avx512icl,      4, 8, 16, 32, 64, 128
+%endif ; HAVE_AVX512ICL
+
+SECTION .text
+
+INIT_XMM avx2
+DECLARE_REG_TMP 4, 6, 7
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+    movifnidn          mxyd, r6m ; mx
+    lea                  t2, [put_avx2]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r7m ; my
+    test               mxyd, mxyd
+    jnz .v
+.put:
+    movzx                wd, word [t2+wq*2+table_offset(put,)]
+    add                  wq, t2
+    jmp                  wq
+.put_w2:
+    movzx               t0d, word [srcq+ssq*0]
+    movzx               t1d, word [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], t0w
+    mov        [dstq+dsq*1], t1w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w2
+    RET
+.put_w4:
+    mov                 t0d, [srcq+ssq*0]
+    mov                 t1d, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], t0d
+    mov        [dstq+dsq*1], t1d
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w4
+    RET
+.put_w8:
+    mov                  t0, [srcq+ssq*0]
+    mov                  t1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], t0
+    mov        [dstq+dsq*1], t1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w8
+    RET
+.put_w16:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w16
+    RET
+INIT_YMM avx2
+.put_w32:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w32
+    RET
+.put_w64:
+    movu                 m0, [srcq+ssq*0+32*0]
+    movu                 m1, [srcq+ssq*0+32*1]
+    movu                 m2, [srcq+ssq*1+32*0]
+    movu                 m3, [srcq+ssq*1+32*1]
+    lea                srcq, [srcq+ssq*2]
+    mova  [dstq+dsq*0+32*0], m0
+    mova  [dstq+dsq*0+32*1], m1
+    mova  [dstq+dsq*1+32*0], m2
+    mova  [dstq+dsq*1+32*1], m3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w64
+    RET
+.put_w128:
+    movu                 m0, [srcq+32*0]
+    movu                 m1, [srcq+32*1]
+    movu                 m2, [srcq+32*2]
+    movu                 m3, [srcq+32*3]
+    add                srcq, ssq
+    mova        [dstq+32*0], m0
+    mova        [dstq+32*1], m1
+    mova        [dstq+32*2], m2
+    mova        [dstq+32*3], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w128
+    RET
+.h:
+    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
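+    ; the imul by 0xff01 plus the (16 << 8) add pack ((16 - mx) << 8) | mx
+    ; into one word, i.e. both pmaddubsw weights from a single scalar multiply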
+    imul               mxyd, 0xff01
+    vbroadcasti128       m4, [bilin_h_shuf8]
+    add                mxyd, 16 << 8
+    movd                xm5, mxyd
+    mov                mxyd, r7m ; my
+    vpbroadcastw         m5, xm5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+    vpbroadcastd         m3, [pw_2048]
+    add                  wq, t2
+    jmp                  wq
+.h_w2:
+    movd                xm0, [srcq+ssq*0]
+    pinsrd              xm0, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+    pmulhrsw            xm0, xm3
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2
+    RET
+.h_w4:
+    mova                xm4, [bilin_h_shuf4]
+.h_w4_loop:
+    movq                xm0, [srcq+ssq*0]
+    movhps              xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+    pmulhrsw            xm0, xm3
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+.h_w8:
+    movu                xm0, [srcq+ssq*0]
+    movu                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pshufb              xm1, xm4
+    pmaddubsw           xm0, xm5
+    pmaddubsw           xm1, xm5
+    pmulhrsw            xm0, xm3
+    pmulhrsw            xm1, xm3
+    packuswb            xm0, xm1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*1+8*0], 1
+    movu                xm1,     [srcq+ssq*0+8*1]
+    vinserti128          m1, m1, [srcq+ssq*1+8*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w32
+    RET
+.h_w64:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    movu                 m1, [srcq+8*4]
+    movu                 m2, [srcq+8*5]
+    add                srcq, ssq
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    packuswb             m1, m2
+    mova        [dstq+32*0], m0
+    mova        [dstq+32*1], m1
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    mov                  t1, -32*3
+.h_w128_loop:
+    movu                 m0, [srcq+t1+32*3+8*0]
+    movu                 m1, [srcq+t1+32*3+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova     [dstq+t1+32*3], m0
+    add                  t1, 32
+    jle .h_w128_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
+    imul               mxyd, 0xff01
+    vpbroadcastd         m5, [pw_2048]
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    movd                xm4, mxyd
+    vpbroadcastw         m4, xm4
+    jmp                  wq
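+    ; rows are interleaved with punpcklbw so a single pmaddubsw applies my to
+    ; the new row and 16-my to the previous one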
+.v_w2:
+    movd                xm0,      [srcq+ssq*0]
+.v_w2_loop:
+    pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+    lea                srcq,      [srcq+ssq*2]
+    pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+    pshuflw             xm1, xm1, q2301           ; 1 0
+    punpcklbw           xm1, xm0, xm1
+    pmaddubsw           xm1, xm4
+    pmulhrsw            xm1, xm5
+    packuswb            xm1, xm1
+    pextrw     [dstq+dsq*0], xm1, 1
+    pextrw     [dstq+dsq*1], xm1, 0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                xm0, [srcq+ssq*0]
+.v_w4_loop:
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm2, xm1, xm0, 0x01 ; 0 1
+    vpbroadcastd        xm0, [srcq+ssq*0]
+    vpblendd            xm1, xm1, xm0, 0x02 ; 1 2
+    punpcklbw           xm1, xm2
+    pmaddubsw           xm1, xm4
+    pmulhrsw            xm1, xm5
+    packuswb            xm1, xm1
+    movd       [dstq+dsq*0], xm1
+    pextrd     [dstq+dsq*1], xm1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm0, [srcq+ssq*0]
+.v_w8_loop:
+    movq                xm3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw           xm1, xm3, xm0
+    movq                xm0, [srcq+ssq*0]
+    punpcklbw           xm2, xm0, xm3
+    pmaddubsw           xm1, xm4
+    pmaddubsw           xm2, xm4
+    pmulhrsw            xm1, xm5
+    pmulhrsw            xm2, xm5
+    packuswb            xm1, xm2
+    movq       [dstq+dsq*0], xm1
+    movhps     [dstq+dsq*1], xm1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+.v_w16:
+    movu                xm0, [srcq+ssq*0]
+.v_w16_loop:
+    vbroadcasti128       m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m3, m2, m0, 0x0f ; 0 1
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vpblendd             m2, m2, m0, 0xf0 ; 1 2
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova         [dstq+dsq*0], xm1
+    vextracti128 [dstq+dsq*1], m1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w16_loop
+    RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+    movu                 m0, [srcq+ssq*0]
+%%loop:
+    movu                 m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m3, m0
+    punpckhbw            m2, m3, m0
+    movu                 m0, [srcq+ssq*0]
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*0], m1
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg %%loop
+%endmacro
+    PUT_BILIN_V_W32
+    RET
+.v_w64:
+    movu                 m0, [srcq+32*0]
+    movu                 m1, [srcq+32*1]
+.v_w64_loop:
+    add                srcq, ssq
+    movu                 m3, [srcq+32*0]
+    punpcklbw            m2, m3, m0
+    punpckhbw            m0, m3, m0
+    pmaddubsw            m2, m4
+    pmaddubsw            m0, m4
+    pmulhrsw             m2, m5
+    pmulhrsw             m0, m5
+    packuswb             m2, m0
+    mova                 m0, m3
+    movu                 m3, [srcq+32*1]
+    mova        [dstq+32*0], m2
+    punpcklbw            m2, m3, m1
+    punpckhbw            m1, m3, m1
+    pmaddubsw            m2, m4
+    pmaddubsw            m1, m4
+    pmulhrsw             m2, m5
+    pmulhrsw             m1, m5
+    packuswb             m2, m1
+    mova                 m1, m3
+    mova        [dstq+32*1], m2
+    add                dstq, dsq
+    dec                  hd
+    jg .v_w64_loop
+    RET
+.v_w128:
+    mov                  t0, dstq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.v_w128_loop:
+    PUT_BILIN_V_W32
+    movzx                hd, t2b
+    add                  t0, 32
+    add                  t1, 32
+    mov                dstq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .v_w128_loop
+    RET
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+    WIN64_SPILL_XMM       8
+    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
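+    ; the h pass leaves 16x-scaled intermediates; doubling the difference and
+    ; taking pmulhw against my << 11 yields (d * my) >> 4, and the final
+    ; pmulhrsw by pw_2048 performs the rounding (x + 8) >> 4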
+    vpbroadcastd         m7, [pw_2048]
+    movd                xm6, mxyd
+    add                  wq, t2
+    vpbroadcastw         m6, xm6
+    jmp                  wq
+.hv_w2:
+    vpbroadcastd        xm0, [srcq+ssq*0]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+.hv_w2_loop:
+    movd                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pinsrd              xm1, [srcq+ssq*0], 1
+    pshufb              xm1, xm4
+    pmaddubsw           xm1, xm5             ; 1 _ 2 _
+    shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
+    mova                xm0, xm1
+    psubw               xm1, xm2
+    paddw               xm1, xm1
+    pmulhw              xm1, xm6
+    paddw               xm1, xm2
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    pextrw     [dstq+dsq*0], xm1, 0
+    pextrw     [dstq+dsq*1], xm1, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                xm4, [bilin_h_shuf4]
+    movddup             xm0, [srcq+ssq*0]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+.hv_w4_loop:
+    movq                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps              xm1, [srcq+ssq*0]
+    pshufb              xm1, xm4
+    pmaddubsw           xm1, xm5             ; 1 2
+    shufps              xm2, xm0, xm1, q1032 ; 0 1
+    mova                xm0, xm1
+    psubw               xm1, xm2
+    paddw               xm1, xm1
+    pmulhw              xm1, xm6
+    paddw               xm1, xm2
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    movd       [dstq+dsq*0], xm1
+    pextrd     [dstq+dsq*1], xm1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    vbroadcasti128       m0,     [srcq+ssq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                xm1,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m1, m1, [srcq+ssq*0], 1
+    pshufb               m1, m4
+    pmaddubsw            m1, m5           ; 1 2
+    vperm2i128           m2, m0, m1, 0x21 ; 0 1
+    mova                 m0, m1
+    psubw                m1, m2
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m2
+    pmulhrsw             m1, m7
+    vextracti128        xm2, m1, 1
+    packuswb            xm1, xm2
+    movq       [dstq+dsq*0], xm1
+    movhps     [dstq+dsq*1], xm1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    movu                 m0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w16_loop:
+    movu                xm2,     [srcq+ssq*1+8*0]
+    vinserti128          m2, m2, [srcq+ssq*1+8*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    movu                xm3,     [srcq+ssq*0+8*0]
+    vinserti128          m3, m3, [srcq+ssq*0+8*1], 1
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m2, m5
+    psubw                m1, m2, m0
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m0
+    pmaddubsw            m0, m3, m5
+    psubw                m3, m0, m2
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m2
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    vpermq               m1, m1, q3120
+    mova         [dstq+dsq*0], xm1
+    vextracti128 [dstq+dsq*1], m1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w16_loop
+    RET
+.hv_w32:
+    xor                 t2d, t2d
+.hv_w32gt:
+    mov                  t0, dstq
+    mov                  t1, srcq
+%if WIN64
+    movaps              r4m, xmm8
+%endif
+.hv_w32_loop0:
+    movu                 m0,     [srcq+8*0]
+    vinserti128          m0, m0, [srcq+8*2], 1
+    movu                 m1,     [srcq+8*1]
+    vinserti128          m1, m1, [srcq+8*3], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+.hv_w32_loop:
+    add                srcq, ssq
+    movu                xm2,     [srcq+8*1]
+    vinserti128          m2, m2, [srcq+8*3], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    pmulhrsw             m8, m3, m7
+    movu                xm2,     [srcq+8*0]
+    vinserti128          m2, m2, [srcq+8*2], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    pmulhrsw             m3, m7
+    packuswb             m3, m8
+    mova             [dstq], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .hv_w32_loop
+    movzx                hd, t2b
+    add                  t0, 32
+    add                  t1, 32
+    mov                dstq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w32_loop0
+%if WIN64
+    movaps             xmm8, r4m
+%endif
+    RET
+.hv_w64:
+    lea                 t2d, [hq+(1<<8)]
+    jmp .hv_w32gt
+.hv_w128:
+    lea                 t2d, [hq+(3<<8)]
+    jmp .hv_w32gt
+
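+; prep_bilin fills the 16-bit intermediate buffer consumed by the
+; compound-prediction kernels instead of writing pixels: the copy path
+; shifts left by 4 and the filter paths keep the raw pmaddubsw sums, so
+; every path stores values at the same 4-bit fractional precision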
+%macro PREP_BILIN 0
+DECLARE_REG_TMP 3, 5, 6
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+    movifnidn          mxyd, r5m ; mx
+    lea                  t2, [prep%+SUFFIX]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r6m ; my
+    test               mxyd, mxyd
+    jnz .v
+.prep:
+    movzx                wd, word [t2+wq*2+table_offset(prep,)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.prep_w4:
+    movd                xm0, [srcq+strideq*0]
+    pinsrd              xm0, [srcq+strideq*1], 1
+    pinsrd              xm0, [srcq+strideq*2], 2
+    pinsrd              xm0, [srcq+stride3q ], 3
+    lea                srcq, [srcq+strideq*4]
+    pmovzxbw            ym0, xm0
+    psllw               ym0, 4
+    mova             [tmpq], ym0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .prep_w4
+    RET
+.prep_w8:
+    movq                xm0, [srcq+strideq*0]
+%if cpuflag(avx512)
+    movq                xm1, [srcq+strideq*1]
+    vinserti128         ym0, [srcq+strideq*2], 1
+    vinserti128         ym1, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    punpcklqdq          ym0, ym1
+    pmovzxbw             m0, ym0
+    psllw                m0, 4
+    mova             [tmpq], m0
+%else
+    movhps              xm0, [srcq+strideq*1]
+    movq                xm1, [srcq+strideq*2]
+    movhps              xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pmovzxbw             m0, xm0
+    pmovzxbw             m1, xm1
+    psllw                m0, 4
+    psllw                m1, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+%endif
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .prep_w8
+    RET
+.prep_w16:
+%if cpuflag(avx512)
+    movu                xm0, [srcq+strideq*0]
+    vinserti128         ym0, [srcq+strideq*1], 1
+    movu                xm1, [srcq+strideq*2]
+    vinserti128         ym1, [srcq+stride3q ], 1
+    pmovzxbw             m0, ym0
+    pmovzxbw             m1, ym1
+%else
+    pmovzxbw             m0, [srcq+strideq*0]
+    pmovzxbw             m1, [srcq+strideq*1]
+    pmovzxbw             m2, [srcq+strideq*2]
+    pmovzxbw             m3, [srcq+stride3q ]
+%endif
+    lea                srcq, [srcq+strideq*4]
+    psllw                m0, 4
+    psllw                m1, 4
+%if notcpuflag(avx512)
+    psllw                m2, 4
+    psllw                m3, 4
+%endif
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+%if notcpuflag(avx512)
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+%endif
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .prep_w16
+    RET
+.prep_w32:
+%if cpuflag(avx512)
+    pmovzxbw             m0, [srcq+strideq*0]
+    pmovzxbw             m1, [srcq+strideq*1]
+    pmovzxbw             m2, [srcq+strideq*2]
+    pmovzxbw             m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+%else
+    pmovzxbw             m0, [srcq+strideq*0+16*0]
+    pmovzxbw             m1, [srcq+strideq*0+16*1]
+    pmovzxbw             m2, [srcq+strideq*1+16*0]
+    pmovzxbw             m3, [srcq+strideq*1+16*1]
+    lea                srcq, [srcq+strideq*2]
+%endif
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+    add                tmpq, mmsize*4
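+    ; mmsize*4/(32*2) rows were consumed: 2 with AVX2 (mmsize == 32),
+    ; 4 with AVX-512 (mmsize == 64)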
+    sub                  hd, mmsize*4/(32*2)
+    jg .prep_w32
+    RET
+.prep_w64:
+%if cpuflag(avx512)
+    pmovzxbw             m0, [srcq+strideq*0+32*0]
+    pmovzxbw             m1, [srcq+strideq*0+32*1]
+    pmovzxbw             m2, [srcq+strideq*1+32*0]
+    pmovzxbw             m3, [srcq+strideq*1+32*1]
+    lea                srcq, [srcq+strideq*2]
+%else
+    pmovzxbw             m0, [srcq+16*0]
+    pmovzxbw             m1, [srcq+16*1]
+    pmovzxbw             m2, [srcq+16*2]
+    pmovzxbw             m3, [srcq+16*3]
+    add                srcq, strideq
+%endif
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+    add                tmpq, mmsize*4
+%if cpuflag(avx512)
+    sub                  hd, 2
+%else
+    dec                  hd
+%endif
+    jg .prep_w64
+    RET
+.prep_w128:
+    pmovzxbw             m0, [srcq+(mmsize/2)*0]
+    pmovzxbw             m1, [srcq+(mmsize/2)*1]
+    pmovzxbw             m2, [srcq+(mmsize/2)*2]
+    pmovzxbw             m3, [srcq+(mmsize/2)*3]
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+%if notcpuflag(avx512)
+    pmovzxbw             m0, [srcq+16*4]
+    pmovzxbw             m1, [srcq+16*5]
+    pmovzxbw             m2, [srcq+16*6]
+    pmovzxbw             m3, [srcq+16*7]
+%endif
+    add                tmpq, 32*8
+    add                srcq, strideq
+%if notcpuflag(avx512)
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq-32*4], m0
+    mova        [tmpq-32*3], m1
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+%endif
+    dec                  hd
+    jg .prep_w128
+    RET
+.h:
+    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+    ; = (16 - mx) * src[x] + mx * src[x + 1]
+    imul               mxyd, 0xff01
+    add                mxyd, 16 << 8
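+    ; mxy*0xff01 + (16 << 8) packs the coefficient word ((16-mx) << 8) | mx;
+    ; pmaddubsw against the byte pairs gathered by the bilin_h_shuf*
+    ; tables then yields (16-mx)*src[x] + mx*src[x+1] per word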
+%if cpuflag(avx512)
+    vpbroadcastw         m5, mxyd
+%else
+    movd                xm5, mxyd
+    vbroadcasti128       m4, [bilin_h_shuf8]
+    vpbroadcastw         m5, xm5
+%endif
+    mov                mxyd, r6m ; my
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.h_w4:
+    vbroadcasti128      ym4, [bilin_h_shuf4]
+.h_w4_loop:
+    movq                xm0, [srcq+strideq*0]
+    movhps              xm0, [srcq+strideq*1]
+    movq                xm1, [srcq+strideq*2]
+    movhps              xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128         ym0, xm1, 1
+    pshufb              ym0, ym4
+    pmaddubsw           ym0, ym5
+    mova             [tmpq], ym0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+.h_w8:
+%if cpuflag(avx512)
+    vbroadcasti128       m4, [bilin_h_shuf8]
+.h_w8_loop:
+    movu                xm0, [srcq+strideq*0]
+    vinserti128         ym0, [srcq+strideq*1], 1
+    vinserti128          m0, [srcq+strideq*2], 2
+    vinserti128          m0, [srcq+stride3q ], 3
+    lea                srcq, [srcq+strideq*4]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    mova        [tmpq+64*0], m0
+%else
+.h_w8_loop:
+    movu                xm0, [srcq+strideq*0]
+    vinserti128          m0, [srcq+strideq*1], 1
+    movu                xm1, [srcq+strideq*2]
+    vinserti128          m1, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+%endif
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .h_w8_loop
+    RET
+.h_w16:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm16]
+.h_w16_loop:
+    movu                ym0, [srcq+strideq*0]
+    vinserti32x8         m0, [srcq+strideq*1], 1
+    movu                ym1, [srcq+strideq*2]
+    vinserti32x8         m1, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    vpermb               m0, m4, m0
+    vpermb               m1, m4, m1
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    mova        [tmpq+64*0], m0
+    mova        [tmpq+64*1], m1
+%else
+.h_w16_loop:
+    movu                xm0, [srcq+strideq*0+8*0]
+    vinserti128          m0, [srcq+strideq*0+8*1], 1
+    movu                xm1, [srcq+strideq*1+8*0]
+    vinserti128          m1, [srcq+strideq*1+8*1], 1
+    movu                xm2, [srcq+strideq*2+8*0]
+    vinserti128          m2, [srcq+strideq*2+8*1], 1
+    movu                xm3, [srcq+stride3q +8*0]
+    vinserti128          m3, [srcq+stride3q +8*1], 1
+    lea                srcq, [srcq+strideq*4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+%endif
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .h_w16_loop
+    RET
+.h_w32:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+.h_w32_loop:
+    vpermb               m0, m4, [srcq+strideq*0]
+    vpermb               m1, m4, [srcq+strideq*1]
+    vpermb               m2, m4, [srcq+strideq*2]
+    vpermb               m3, m4, [srcq+stride3q ]
+    lea                srcq,     [srcq+strideq*4]
+%else
+.h_w32_loop:
+    movu                xm0, [srcq+strideq*0+8*0]
+    vinserti128          m0, [srcq+strideq*0+8*1], 1
+    movu                xm1, [srcq+strideq*0+8*2]
+    vinserti128          m1, [srcq+strideq*0+8*3], 1
+    movu                xm2, [srcq+strideq*1+8*0]
+    vinserti128          m2, [srcq+strideq*1+8*1], 1
+    movu                xm3, [srcq+strideq*1+8*2]
+    vinserti128          m3, [srcq+strideq*1+8*3], 1
+    lea                srcq, [srcq+strideq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+%endif
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+    add                tmpq, mmsize*4
+    sub                  hd, mmsize*4/(32*2)
+    jg .h_w32_loop
+    RET
+.h_w64:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+.h_w64_loop:
+    vpermb               m0, m4, [srcq+strideq*0+32*0]
+    vpermb               m1, m4, [srcq+strideq*0+32*1]
+    vpermb               m2, m4, [srcq+strideq*1+32*0]
+    vpermb               m3, m4, [srcq+strideq*1+32*1]
+    lea                srcq,     [srcq+strideq*2]
+%else
+.h_w64_loop:
+    movu                xm0, [srcq+8*0]
+    vinserti128          m0, [srcq+8*1], 1
+    movu                xm1, [srcq+8*2]
+    vinserti128          m1, [srcq+8*3], 1
+    movu                xm2, [srcq+8*4]
+    vinserti128          m2, [srcq+8*5], 1
+    movu                xm3, [srcq+8*6]
+    vinserti128          m3, [srcq+8*7], 1
+    add                srcq, strideq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+%endif
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+    add                tmpq, mmsize*4
+%if cpuflag(avx512)
+    sub                  hd, 2
+%else
+    dec                  hd
+%endif
+    jg .h_w64_loop
+    RET
+.h_w128:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+.h_w128_loop:
+    vpermb               m0, m4, [srcq+32*0]
+    vpermb               m1, m4, [srcq+32*1]
+    vpermb               m2, m4, [srcq+32*2]
+    vpermb               m3, m4, [srcq+32*3]
+%else
+.h_w128_loop:
+    movu                xm0, [srcq+8*0]
+    vinserti128          m0, [srcq+8*1], 1
+    movu                xm1, [srcq+8*2]
+    vinserti128          m1, [srcq+8*3], 1
+    movu                xm2, [srcq+8*4]
+    vinserti128          m2, [srcq+8*5], 1
+    movu                xm3, [srcq+8*6]
+    vinserti128          m3, [srcq+8*7], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+%endif
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m1
+    mova    [tmpq+mmsize*2], m2
+    mova    [tmpq+mmsize*3], m3
+%if notcpuflag(avx512)
+    movu                xm0, [srcq+8* 8]
+    vinserti128          m0, [srcq+8* 9], 1
+    movu                xm1, [srcq+8*10]
+    vinserti128          m1, [srcq+8*11], 1
+    movu                xm2, [srcq+8*12]
+    vinserti128          m2, [srcq+8*13], 1
+    movu                xm3, [srcq+8*14]
+    vinserti128          m3, [srcq+8*15], 1
+%endif
+    add                tmpq, 32*8
+    add                srcq, strideq
+%if notcpuflag(avx512)
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq-32*4], m0
+    mova        [tmpq-32*3], m1
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+%endif
+    dec                  hd
+    jg .h_w128_loop
+    RET
+.v:
+    WIN64_SPILL_XMM       7
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 0xff01
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+%if cpuflag(avx512)
+    vpbroadcastw         m6, mxyd
+%else
+    movd                xm6, mxyd
+    vpbroadcastw         m6, xm6
+%endif
+    jmp                  wq
+.v_w4:
+%if cpuflag(avx512)
+    vpbroadcastd        xm0, [srcq+strideq*0]
+    mov                 r3d, 0x29
+    vbroadcasti128      ym3, [bilin_v_shuf4]
+    kmovb                k1, r3d
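+    ; k1 = 0x29 = 0b101001 enables dword lanes 0, 3 and 5 in the masked
+    ; broadcasts and blends below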
+.v_w4_loop:
+    vpblendmd       xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+    vpbroadcastd        ym2, [srcq+strideq*2]
+    vpbroadcastd    ym2{k1}, [srcq+stride3q ]             ; __2_ 23__
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastd        ym0, [srcq+strideq*0]
+    punpckhqdq      ym2{k1}, ym1, ym0                     ; 012_ 234_
+    pshufb              ym2, ym3
+%else
+    movd                xm0, [srcq+strideq*0]
+.v_w4_loop:
+    vpbroadcastd         m1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    vpbroadcastd         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m0, 0x05 ; 0 2 2 2
+    vpbroadcastd         m0, [srcq+strideq*0]
+    vpblendd             m3, m3, m2, 0x0f ; 1 1 3 3
+    vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
+    vpblendd             m1, m1, m3, 0xaa ; 0 1 2 3
+    vpblendd             m2, m2, m3, 0x55 ; 1 2 3 4
+    punpcklbw            m2, m1
+%endif
+    pmaddubsw           ym2, ym6
+    mova             [tmpq], ym2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+    RET
+.v_w8:
+%if cpuflag(avx512icl)
+    mova                 m5, [bilin_v_perm8]
+    vbroadcasti128      ym0, [srcq+strideq*0]
+%else
+    movq                xm0, [srcq+strideq*0]
+%endif
+.v_w8_loop:
+%if cpuflag(avx512icl)
+    vinserti128         ym1, ym0, [srcq+strideq*1], 1
+    vpbroadcastq        ym0, [srcq+strideq*2]
+    vinserti128          m1, [srcq+stride3q ], 2
+    lea                srcq, [srcq+strideq*4]
+    vinserti128         ym0, [srcq+strideq*0], 0
+    vpermt2b             m1, m5, m0
+    pmaddubsw            m1, m6
+    mova             [tmpq], m1
+%else
+    vpbroadcastq         m1, [srcq+strideq*2]
+    vpbroadcastq         m2, [srcq+strideq*1]
+    vpbroadcastq         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m0, 0x03 ; 0 2 2 2
+    vpbroadcastq         m0, [srcq+strideq*0]
+    vpblendd             m3, m3, m2, 0x33 ; 1 3 1 3
+    vpblendd             m2, m1, m3, 0x0f ; 1 3 2 2
+    vpblendd             m1, m1, m3, 0xf0 ; 0 2 1 3
+    vpblendd             m2, m2, m0, 0xc0 ; 1 3 2 4
+    punpcklbw            m3, m2, m1
+    punpckhbw            m2, m1
+    pmaddubsw            m3, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m2
+%endif
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+%if cpuflag(avx512icl)
+    mova                 m5, [bilin_v_perm16]
+    movu                xm0, [srcq+strideq*0]
+.v_w16_loop:
+    movu                xm2, [srcq+strideq*2]
+    vinserti128         ym1, ym0, [srcq+strideq*1], 1
+    vpermt2b             m1, m5, m2
+    vinserti128         ym2, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    movu                xm0, [srcq+strideq*0]
+    vpermt2b             m2, m5, m0
+    pmaddubsw            m1, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq+64*0], m1
+    mova        [tmpq+64*1], m2
+%else
+    vbroadcasti128       m0, [srcq+strideq*0]
+.v_w16_loop:
+    vbroadcasti128       m1, [srcq+strideq*2]
+    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    shufpd               m4, m0, m1, 0x0c ; 0 2  ; 0l2l 0h2h
+    vbroadcasti128       m0, [srcq+strideq*0]
+    shufpd               m2, m2, m3, 0x0c ; 1 3  ; 1l3l 1h3h
+    shufpd               m1, m1, m0, 0x0c ; 2 4  ; 2l4l 2h4h
+    punpcklbw            m3, m2, m4
+    punpcklbw            m5, m1, m2
+    punpckhbw            m1, m2
+    punpckhbw            m2, m4
+    pmaddubsw            m3, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m1, m6
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m5
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m1
+%endif
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .v_w16_loop
+    RET
+.v_w32:
+%if cpuflag(avx512icl)
+    mova                 m5, [bilin_v_perm32]
+    movu                ym0, [srcq+strideq*0]
+.v_w32_loop:
+    movu                ym2, [srcq+strideq*1]
+    movu                ym3, [srcq+strideq*2]
+    movu                ym4, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpermt2b             m0, m5, m2
+    vpermt2b             m2, m5, m3
+    vpermt2b             m3, m5, m4
+    pmaddubsw            m1, m0, m6
+    movu                ym0, [srcq+strideq*0]
+    vpermt2b             m4, m5, m0
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmaddubsw            m4, m6
+    mova        [tmpq+64*0], m1
+    mova        [tmpq+64*1], m2
+    mova        [tmpq+64*2], m3
+    mova        [tmpq+64*3], m4
+    add                tmpq, 64*4
+%else
+    vpermq              ym0, [srcq+strideq*0], q3120
+.v_w32_loop:
+    vpermq              ym1, [srcq+strideq*1], q3120
+    vpermq              ym2, [srcq+strideq*2], q3120
+    vpermq              ym3, [srcq+stride3q ], q3120
+    lea                srcq, [srcq+strideq*4]
+    punpcklbw            m4, m1, m0
+    punpckhbw            m5, m1, m0
+    vpermq              ym0, [srcq+strideq*0], q3120
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], ym4
+    mova        [tmpq+32*1], ym5
+    punpcklbw            m4, m2, m1
+    punpckhbw            m5, m2, m1
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*2], ym4
+    mova        [tmpq+32*3], ym5
+    add                tmpq, 32*8
+    punpcklbw            m4, m3, m2
+    punpckhbw            m5, m3, m2
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m1, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq-32*4], m4
+    mova        [tmpq-32*3], m5
+    mova        [tmpq-32*2], m1
+    mova        [tmpq-32*1], m2
+%endif
+    sub                  hd, 4
+    jg .v_w32_loop
+    RET
+.v_w64:
+%if cpuflag(avx512)
+    mova                 m5, [bilin_v_perm64]
+    vpermq               m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+    vpermq               m1, m5, [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    punpcklbw            m4, m1, m0
+    punpckhbw            m2, m1, m0
+    vpermq               m0, m5, [srcq+strideq*0]
+    punpcklbw            m3, m0, m1
+    punpckhbw            m1, m0, m1
+    pmaddubsw            m4, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmaddubsw            m1, m6
+    mova        [tmpq+64*0], m4
+    mova        [tmpq+64*1], m2
+    mova        [tmpq+64*2], m3
+    mova        [tmpq+64*3], m1
+    add                tmpq, 64*4
+%else
+    vpermq               m0, [srcq+strideq*0+32*0], q3120
+    vpermq               m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+    vpermq               m2, [srcq+strideq*1+32*0], q3120
+    vpermq               m3, [srcq+strideq*1+32*1], q3120
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m4, m2, m0
+    punpckhbw            m5, m2, m0
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], m4
+    mova        [tmpq+32*1], m5
+    punpcklbw            m4, m3, m1
+    punpckhbw            m5, m3, m1
+    vpermq               m0, [srcq+strideq*0+32*0], q3120
+    vpermq               m1, [srcq+strideq*0+32*1], q3120
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*2], m4
+    mova        [tmpq+32*3], m5
+    add                tmpq, 32*8
+    punpcklbw            m4, m0, m2
+    punpckhbw            m5, m0, m2
+    punpcklbw            m2, m1, m3
+    punpckhbw            m3, m1, m3
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    mova        [tmpq-32*4], m4
+    mova        [tmpq-32*3], m5
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+%endif
+    sub                  hd, 2
+    jg .v_w64_loop
+    RET
+.v_w128:
+%if cpuflag(avx512)
+    mova                 m5, [bilin_v_perm64]
+    vpermq               m0, m5, [srcq+strideq*0+ 0]
+    vpermq               m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+    vpermq               m2, m5, [srcq+strideq*1+ 0]
+    vpermq               m3, m5, [srcq+strideq*1+64]
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m4, m2, m0
+    punpckhbw            m0, m2, m0
+    pmaddubsw            m4, m6
+    pmaddubsw            m0, m6
+    mova        [tmpq+64*0], m4
+    mova        [tmpq+64*1], m0
+    punpcklbw            m4, m3, m1
+    punpckhbw            m1, m3, m1
+    pmaddubsw            m4, m6
+    pmaddubsw            m1, m6
+    mova        [tmpq+64*2], m4
+    mova        [tmpq+64*3], m1
+    vpermq               m0, m5, [srcq+strideq*0+ 0]
+    vpermq               m1, m5, [srcq+strideq*0+64]
+    punpcklbw            m4, m0, m2
+    punpckhbw            m2, m0, m2
+    pmaddubsw            m4, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq+64*4], m4
+    mova        [tmpq+64*5], m2
+    punpcklbw            m4, m1, m3
+    punpckhbw            m3, m1, m3
+    pmaddubsw            m4, m6
+    pmaddubsw            m3, m6
+    mova        [tmpq+64*6], m4
+    mova        [tmpq+64*7], m3
+    add                tmpq, 64*8
+    sub                  hd, 2
+    jg .v_w128_loop
+%else
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.v_w128_loop0:
+    vpermq               m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+    vpermq               m1, [srcq+strideq*1], q3120
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m2, m1, m0
+    punpckhbw            m3, m1, m0
+    vpermq               m0, [srcq+strideq*0], q3120
+    punpcklbw            m4, m0, m1
+    punpckhbw            m5, m0, m1
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], m2
+    mova        [tmpq+32*1], m3
+    mova        [tmpq+32*8], m4
+    mova        [tmpq+32*9], m5
+    add                tmpq, 32*16
+    sub                  hd, 2
+    jg .v_w128_loop
+    movzx                hd, t2b
+    add                  t0, 64
+    add                  t1, 32
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .v_w128_loop0
+%endif
+    RET
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       7
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+    shl                mxyd, 11
+%if cpuflag(avx512)
+    vpbroadcastw         m6, mxyd
+%else
+    movd                xm6, mxyd
+    vpbroadcastw         m6, xm6
+%endif
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
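+    ; prep keeps 16-bit output, so a single pmulhrsw by (my << 11)
+    ; computes (my*d + 8) >> 4 with rounding; no doubling trick is needed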
+.hv_w4:
+    vbroadcasti128      ym4, [bilin_h_shuf4]
+    vpbroadcastq        ym0, [srcq+strideq*0]
+    pshufb              ym0, ym4
+    pmaddubsw           ym0, ym5
+.hv_w4_loop:
+    movq                xm1, [srcq+strideq*1]
+    movhps              xm1, [srcq+strideq*2]
+    movq                xm2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movhps              xm2, [srcq+strideq*0]
+    vinserti128         ym1, xm2, 1
+    pshufb              ym1, ym4
+    pmaddubsw           ym1, ym5         ; 1 2 3 4
+%if cpuflag(avx512)
+    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
+%else
+    vpblendd            ym2, ym1, ym0, 0xc0
+    vpermq              ym2, ym2, q2103  ; 0 1 2 3
+%endif
+    mova                ym0, ym1
+    psubw               ym1, ym2
+    pmulhrsw            ym1, ym6
+    paddw               ym1, ym2
+    mova             [tmpq], ym1
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+%if cpuflag(avx512)
+    vbroadcasti128       m4, [bilin_h_shuf8]
+%endif
+    vbroadcasti128       m0, [srcq+strideq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                xm1, [srcq+strideq*1]
+%if cpuflag(avx512)
+    vinserti128         ym1, [srcq+strideq*2], 1
+    vinserti128          m1, [srcq+stride3q ], 2
+    lea                srcq, [srcq+strideq*4]
+    vinserti128          m1, [srcq+strideq*0], 3
+    pshufb               m1, m4
+    pmaddubsw            m1, m5        ; 1 2 3 4
+    valignq              m2, m1, m0, 6 ; 0 1 2 3
+    mova                 m0, m1
+    psubw                m1, m2
+    pmulhrsw             m1, m6
+    paddw                m1, m2
+    mova             [tmpq], m1
+%else
+    vinserti128          m1, m1, [srcq+strideq*2], 1
+    movu                xm2,     [srcq+stride3q ]
+    lea                srcq,     [srcq+strideq*4]
+    vinserti128          m2, m2, [srcq+strideq*0], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5           ; 1 2
+    vperm2i128           m3, m0, m1, 0x21 ; 0 1
+    pmaddubsw            m0, m2, m5       ; 3 4
+    vperm2i128           m2, m1, m0, 0x21 ; 2 3
+    psubw                m1, m3
+    pmulhrsw             m1, m6
+    paddw                m1, m3
+    psubw                m3, m0, m2
+    pmulhrsw             m3, m6
+    paddw                m3, m2
+    mova        [tmpq+32*0], m1
+    mova        [tmpq+32*1], m3
+%endif
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm16]
+    vbroadcasti32x8      m0, [srcq+strideq*0]
+    vpermb               m0, m4, m0
+%else
+    movu                xm0, [srcq+strideq*0+8*0]
+    vinserti128          m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+%endif
+    pmaddubsw            m0, m5
+.hv_w16_loop:
+%if cpuflag(avx512icl)
+    movu                ym1, [srcq+strideq*1]
+    vinserti32x8         m1, [srcq+strideq*2], 1
+    movu                ym2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti32x8         m2, [srcq+strideq*0], 1
+    vpermb               m1, m4, m1
+    vpermb               m2, m4, m2
+    pmaddubsw            m1, m5            ; 1 2
+    vshufi32x4           m3, m0, m1, q1032 ; 0 1
+    pmaddubsw            m0, m2, m5        ; 3 4
+    vshufi32x4           m2, m1, m0, q1032 ; 2 3
+    psubw                m1, m3
+    pmulhrsw             m1, m6
+    paddw                m1, m3
+    psubw                m3, m0, m2
+    pmulhrsw             m3, m6
+    paddw                m3, m2
+    mova        [tmpq+64*0], m1
+    mova        [tmpq+64*1], m3
+%else
+    movu                xm1, [srcq+strideq*1+8*0]
+    vinserti128          m1, [srcq+strideq*1+8*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                xm2, [srcq+strideq*0+8*0]
+    vinserti128          m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m2
+%endif
+    add                tmpq, mmsize*2
+    sub                  hd, mmsize*2/(16*2)
+    jg .hv_w16_loop
+    RET
+.hv_w32:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+    vpermb               m0, m4, [srcq+strideq*0]
+    pmaddubsw            m0, m5
+.hv_w32_loop:
+    vpermb               m1, m4, [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    vpermb               m2, m4, [srcq+strideq*0]
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+64*0], m3
+    mova        [tmpq+64*1], m2
+    add                tmpq, 64*2
+    sub                  hd, 2
+%else
+    movu                xm0, [srcq+8*0]
+    vinserti128          m0, [srcq+8*1], 1
+    movu                xm1, [srcq+8*2]
+    vinserti128          m1, [srcq+8*3], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+.hv_w32_loop:
+    add                srcq, strideq
+    movu                xm2,     [srcq+8*0]
+    vinserti128          m2, m2, [srcq+8*1], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    mova          [tmpq+ 0], m3
+    movu                xm2,     [srcq+8*2]
+    vinserti128          m2, m2, [srcq+8*3], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    pmulhrsw             m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    mova          [tmpq+32], m3
+    add                tmpq, 32*2
+    dec                  hd
+%endif
+    jg .hv_w32_loop
+    RET
+.hv_w64:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+    vpermb               m0, m4, [srcq+32*0]
+    vpermb               m1, m4, [srcq+32*1]
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+.hv_w64_loop:
+    add                srcq, strideq
+    vpermb               m2, m4, [srcq+32*0]
+    vpermb               m3, m4, [srcq+32*1]
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    psubw                m7, m2, m0
+    psubw                m8, m3, m1
+    pmulhrsw             m7, m6
+    pmulhrsw             m8, m6
+    paddw                m7, m0
+    paddw                m8, m1
+    mova          [tmpq+ 0], m7
+    mova          [tmpq+64], m8
+    mova                 m0, m2
+    mova                 m1, m3
+    add                tmpq, 64*2
+    dec                  hd
+    jg .hv_w64_loop
+%else
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.hv_w64_loop0:
+    movu                xm0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w64_loop:
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm2,     [srcq+strideq*0+8*0]
+    vinserti128          m2, m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    add                tmpq, 32*8
+    mova        [tmpq-32*4], m2
+    sub                  hd, 2
+    jg .hv_w64_loop
+    movzx                hd, t2b
+    add                  t0, 32
+    add                  t1, 16
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w64_loop0
+%endif
+    RET
+.hv_w128:
+%if cpuflag(avx512icl)
+    mova                 m4, [bilin_h_perm32]
+    vpermb               m0, m4, [srcq+32*0]
+    vpermb               m1, m4, [srcq+32*1]
+    vpermb               m2, m4, [srcq+32*2]
+    vpermb               m3, m4, [srcq+32*3]
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+.hv_w128_loop:
+    add                srcq, strideq
+    vpermb               m7, m4, [srcq+32*0]
+    vpermb               m8, m4, [srcq+32*1]
+    vpermb               m9, m4, [srcq+32*2]
+    vpermb              m10, m4, [srcq+32*3]
+    pmaddubsw            m7, m5
+    pmaddubsw            m8, m5
+    pmaddubsw            m9, m5
+    pmaddubsw           m10, m5
+    psubw               m11, m7, m0
+    psubw               m12, m8, m1
+    psubw               m13, m9, m2
+    psubw               m14, m10, m3
+    pmulhrsw            m11, m6
+    pmulhrsw            m12, m6
+    pmulhrsw            m13, m6
+    pmulhrsw            m14, m6
+    paddw               m11, m0
+    paddw               m12, m1
+    paddw               m13, m2
+    paddw               m14, m3
+    mova        [tmpq+64*0], m11
+    mova        [tmpq+64*1], m12
+    mova        [tmpq+64*2], m13
+    mova        [tmpq+64*3], m14
+    mova                 m0, m7
+    mova                 m1, m8
+    mova                 m2, m9
+    mova                 m3, m10
+    add                tmpq, 64*4
+    dec                  hd
+    jg .hv_w128_loop
+%else
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(7<<8)]
+.hv_w128_loop0:
+    movu                xm0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w128_loop:
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm2,     [srcq+strideq*0+8*0]
+    vinserti128          m2, m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*8], m2
+    add                tmpq, 32*16
+    sub                  hd, 2
+    jg .hv_w128_loop
+    movzx                hd, t2b
+    add                  t0, mmsize
+    add                  t1, mmsize/2
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w128_loop0
+%endif
+    RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
+%assign FILTER_SHARP   (2*15 << 16) | 3*15
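+; each FILTER_* constant packs two offsets into subpel_filters (15
+; entries of 8 taps per set): bits 16+ select the 8-tap set used for
+; large blocks, the low bits the 4-tap set used for w <= 4 (and h < 6
+; vertically); sharp has no 4-tap variant and reuses the regular one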
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2
+    mov                 t0d, FILTER_%3
+    mov                 t1d, FILTER_%4
+%ifnidn %2, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
+%endif
+%endmacro
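+; FN emits one small entry point per filter combination: it loads the
+; packed selectors into t0d/t1d and tail-jumps into the shared body; the
+; combination emitted last falls through instead, which is what the
+; %ifnidn above skips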
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN regular,        REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_FN sharp,          SHARP,   SHARP
+PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
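+    ; mx*0x010101 replicates the subpel offset into bytes 0-2: after the
+    ; add, byte 0 holds the 4-tap index, byte 1 a copy of mx (tested via
+    ; 0xf00 below) and bits 16+ the 8-tap index (set*15 + mx)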
+    lea                  r8, [put_avx2]
+    movsxd               wq, wm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [r8+wq*2+table_offset(put,)]
+    add                  wq, r8
+    lea                  r6, [ssq*3]
+    lea                  r7, [dsq*3]
+%if WIN64
+    pop                  r8
+%endif
+    jmp                  wq
+.h:
+    test                myd, 0xf00
+    jnz .hv
+    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
+    WIN64_SPILL_XMM      11
+    cmp                  wd, 4
+    jl .h_w2
+    vbroadcasti128       m6, [subpel_h_shufA]
+    je .h_w4
+    tzcnt                wd, wd
+    vbroadcasti128       m7, [subpel_h_shufB]
+    vbroadcasti128       m8, [subpel_h_shufC]
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+    vpbroadcastd         m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+    add                  wq, r8
+    jmp                  wq
+.h_w2:
+    movzx               mxd, mxb
+    dec                srcq
+    mova                xm4, [subpel_h_shuf4]
+    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+    movq                xm0, [srcq+ssq*0]
+    movhps              xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm3
+    phaddw              xm0, xm0
+    paddw               xm0, xm5
+    psraw               xm0, 6
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+    movq                xm0, [srcq+ssq*0]
+    movq                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm6
+    pshufb              xm1, xm6
+    pmaddubsw           xm0, xm3
+    pmaddubsw           xm1, xm3
+    phaddw              xm0, xm1
+    paddw               xm0, xm5
+    psraw               xm0, 6
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+    pshufb              m%2, m%1, m7
+    pshufb              m%3, m%1, m8
+    pshufb              m%1, m6
+    pmaddubsw           m%4, m%2, m9
+    pmaddubsw           m%2, m10
+    pmaddubsw           m%3, m10
+    pmaddubsw           m%1, m9
+    paddw               m%3, m%4
+    paddw               m%1, m%2
+    phaddw              m%1, m%3
+    paddw               m%1, m5
+    psraw               m%1, 6
+%endmacro
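+    ; each output pixel needs 8 products: they are accumulated as two
+    ; pmaddubsw halves (m9 = taps 0-3, m10 = taps 4-7) over overlapping
+    ; 4-pixel windows gathered by subpel_h_shufA/B/C, then phaddw folds
+    ; the remaining word pairs before the pw_34 bias and the shift by 6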
+    movu                xm0,     [srcq+ssq*0]
+    vinserti128          m0, m0, [srcq+ssq*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    PUT_8TAP_H            0, 1, 2, 3
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*1+8*0], 1
+    movu                xm1,     [srcq+ssq*0+8*1]
+    vinserti128          m1, m1, [srcq+ssq*1+8*1], 1
+    PUT_8TAP_H            0, 2, 3, 4
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H            1, 2, 3, 4
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    xor                 r6d, r6d
+    jmp .h_start
+.h_w64:
+    mov                  r6, -32*1
+    jmp .h_start
+.h_w128:
+    mov                  r6, -32*3
+.h_start:
+    sub                srcq, r6
+    sub                dstq, r6
+    mov                  r4, r6
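+    ; r6 runs from -(w-32) up to 0 in 32-byte steps; srcq/dstq were
+    ; biased by -r6 so [srcq+r6] walks one row, and r4 preserves the
+    ; initial offset for the next row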
+.h_loop:
+    movu                 m0, [srcq+r6+8*0]
+    movu                 m1, [srcq+r6+8*1]
+    PUT_8TAP_H            0, 2, 3, 4
+    PUT_8TAP_H            1, 2, 3, 4
+    packuswb             m0, m1
+    mova          [dstq+r6], m0
+    add                  r6, 32
+    jle .h_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    mov                  r6, r4
+    dec                  hd
+    jg .h_loop
+    RET
+.v:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
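+    ; h < 6 selects the 4-tap vertical filter set (its index was saved
+    ; from the low byte into mxd above)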
+    tzcnt               r6d, wd
+    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+    vpbroadcastd         m7, [pw_512]
+    lea                 myq, [r8+myq*8+subpel_filters-put_avx2]
+    vpbroadcastw         m8, [myq+0]
+    vpbroadcastw         m9, [myq+2]
+    vpbroadcastw        m10, [myq+4]
+    vpbroadcastw        m11, [myq+6]
+    add                  r6, r8
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    jmp                  r6
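+    ; rows are kept as interleaved byte pairs (01 12, 23 34, 45 56); each
+    ; pmaddubsw against a broadcast tap pair (m8-m11) accumulates two
+    ; output rows at once and the registers rotate as new rows arrive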
+.v_w2:
+    movd                xm2, [srcq+ssq*0]
+    pinsrw              xm2, [srcq+ssq*1], 2
+    pinsrw              xm2, [srcq+ssq*2], 4
+    pinsrw              xm2, [srcq+ss3q ], 6 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*4]
+    movd                xm3, [srcq+ssq*0]
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd            xm3, xm3, xm1, 0x02  ; 4 5
+    vpblendd            xm1, xm1, xm0, 0x02  ; 5 6
+    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
+    punpcklbw           xm3, xm1             ; 45 56
+    punpcklbw           xm1, xm2, xm4        ; 01 12
+    punpckhbw           xm2, xm4             ; 23 34
+.v_w2_loop:
+    pmaddubsw           xm5, xm1, xm8        ; a0 b0
+    mova                xm1, xm2
+    pmaddubsw           xm2, xm9             ; a1 b1
+    paddw               xm5, xm2
+    mova                xm2, xm3
+    pmaddubsw           xm3, xm10            ; a2 b2
+    paddw               xm5, xm3
+    vpbroadcastd        xm4, [srcq+ssq*0]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm4, xm4, xm0, 0x02  ; 7 8
+    punpcklbw           xm3, xm4             ; 67 78
+    pmaddubsw           xm4, xm3, xm11       ; a3 b3
+    paddw               xm5, xm4
+    pmulhrsw            xm5, xm7
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0
+    pextrw     [dstq+dsq*1], xm5, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                xm2, [srcq+ssq*0]
+    pinsrd              xm2, [srcq+ssq*1], 1
+    pinsrd              xm2, [srcq+ssq*2], 2
+    pinsrd              xm2, [srcq+ss3q ], 3 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*4]
+    movd                xm3, [srcq+ssq*0]
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd            xm3, xm3, xm1, 0x02  ; 4 5
+    vpblendd            xm1, xm1, xm0, 0x02  ; 5 6
+    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
+    punpcklbw           xm3, xm1             ; 45 56
+    punpcklbw           xm1, xm2, xm4        ; 01 12
+    punpckhbw           xm2, xm4             ; 23 34
+.v_w4_loop:
+    pmaddubsw           xm5, xm1, xm8        ; a0 b0
+    mova                xm1, xm2
+    pmaddubsw           xm2, xm9             ; a1 b1
+    paddw               xm5, xm2
+    mova                xm2, xm3
+    pmaddubsw           xm3, xm10            ; a2 b2
+    paddw               xm5, xm3
+    vpbroadcastd        xm4, [srcq+ssq*0]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm4, xm4, xm0, 0x02  ; 7 8
+    punpcklbw           xm3, xm4             ; 67 78
+    pmaddubsw           xm4, xm3, xm11       ; a3 b3
+    paddw               xm5, xm4
+    pmulhrsw            xm5, xm7
+    packuswb            xm5, xm5
+    movd       [dstq+dsq*0], xm5
+    pextrd     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm1, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m2, [srcq+ssq*2]
+    vpbroadcastq         m5, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpbroadcastq         m6, [srcq+ssq*1]
+    vpbroadcastq         m0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m1, m1, m4, 0x30
+    vpblendd             m4, m4, m2, 0x30
+    punpcklbw            m1, m4 ; 01 12
+    vpblendd             m2, m2, m5, 0x30
+    vpblendd             m5, m5, m3, 0x30
+    punpcklbw            m2, m5 ; 23 34
+    vpblendd             m3, m3, m6, 0x30
+    vpblendd             m6, m6, m0, 0x30
+    punpcklbw            m3, m6 ; 45 56
+.v_w8_loop:
+    pmaddubsw            m5, m1, m8  ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m2, m9      ; a1 b1
+    paddw                m5, m2
+    mova                 m2, m3
+    pmaddubsw            m3, m10     ; a2 b2
+    paddw                m5, m3
+    vpbroadcastq         m4, [srcq+ssq*0]
+    vpblendd             m3, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m3, m4      ; 67 78
+    pmaddubsw            m4, m3, m11 ; a3 b3
+    paddw                m5, m4
+    pmulhrsw             m5, m7
+    vextracti128        xm4, m5, 1
+    packuswb            xm5, xm4
+    movq       [dstq+dsq*0], xm5
+    movhps     [dstq+dsq*1], xm5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+    lea                 r6d, [wq-16]
+    mov                  r4, dstq
+    mov                  r7, srcq
+    shl                 r6d, 4
+    mov                 r6b, hb
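+    ; r6d = ((w-16) << 4) | h: bits 8+ count the remaining 16-pixel
+    ; column strips, the low byte restores h for each strip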
+.v_w16_loop0:
+    vbroadcasti128       m4, [srcq+ssq*0]
+    vbroadcasti128       m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m0, [srcq+ssq*1]
+    vbroadcasti128       m6, [srcq+ssq*0]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m1, [srcq+ssq*0]
+    vbroadcasti128       m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m3, [srcq+ssq*0]
+    shufpd               m4, m4, m0, 0x0c
+    shufpd               m5, m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w16_loop:
+    vbroadcasti128      m12, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128      m13, [srcq+ssq*0]
+    pmaddubsw           m14, m1, m8  ; a0
+    pmaddubsw           m15, m2, m8  ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, m9      ; a1
+    pmaddubsw            m4, m9      ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, m10     ; a2
+    pmaddubsw            m6, m10     ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, m11 ; a3
+    pmaddubsw           m13, m6, m11 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    packuswb            m14, m15
+    vpermq              m14, m14, q3120
+    mova         [dstq+dsq*0], xm14
+    vextracti128 [dstq+dsq*1], m14, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w16_loop
+    movzx                hd, r6b
+    add                  r4, 16
+    add                  r7, 16
+    mov                dstq, r4
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_w16_loop0
+    RET
+.hv:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    vpbroadcastd         m8, [pw_8192]
+    vpbroadcastd         m9, [pd_512]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+    cmp                  wd, 4
+    je .hv_w4
+    vbroadcasti128       m6, [subpel_h_shuf4]
+    movq                xm2, [srcq+ssq*0]
+    movhps              xm2, [srcq+ssq*1]
+    movq                xm0, [srcq+ssq*2]
+    movhps              xm0, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m2, m2, m3, 0x30
+    vpblendd             m0, m0, m1, 0x30
+    vpblendd             m2, m2, m4, 0xc0
+    pshufb               m2, m6
+    pshufb               m0, m6
+    pmaddubsw            m2, m7
+    pmaddubsw            m0, m7
+    phaddw               m2, m0
+    pmulhrsw             m2, m8
+    vextracti128        xm3, m2, 1
+    palignr             xm4, xm3, xm2, 4
+    punpcklwd           xm1, xm2, xm4  ; 01 12
+    punpckhwd           xm2, xm4       ; 23 34
+    pshufd              xm0, xm3, q2121
+    punpcklwd           xm3, xm0       ; 45 56
+.hv_w2_loop:
+    pmaddwd             xm5, xm1, xm10 ; a0 b0
+    mova                xm1, xm2
+    pmaddwd             xm2, xm11      ; a1 b1
+    paddd               xm5, xm2
+    mova                xm2, xm3
+    pmaddwd             xm3, xm12      ; a2 b2
+    paddd               xm5, xm3
+    movq                xm4, [srcq+ssq*0]
+    movhps              xm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm4, xm6
+    pmaddubsw           xm4, xm7
+    phaddw              xm4, xm4
+    pmulhrsw            xm4, xm8
+    palignr             xm3, xm4, xm0, 12
+    mova                xm0, xm4
+    punpcklwd           xm3, xm0       ; 67 78
+    pmaddwd             xm4, xm3, xm13 ; a3 b3
+    paddd               xm5, xm9
+    paddd               xm5, xm4
+    psrad               xm5, 10
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0
+    pextrw     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                 m6, [subpel_h_shuf4]
+    vpbroadcastq         m2, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m0, [srcq+ssq*2]
+    vpbroadcastq         m5, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpblendd             m2, m2, m4, 0xcc ; 0 1
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m0, m0, m5, 0xcc ; 2 3
+    vpblendd             m3, m3, m4, 0xcc ; 4 5
+    pshufb               m2, m6
+    pshufb               m0, m6
+    pshufb               m3, m6
+    pshufb               m1, m6
+    pmaddubsw            m2, m7
+    pmaddubsw            m0, m7
+    pmaddubsw            m3, m7
+    pmaddubsw            m1, m7
+    phaddw               m2, m0
+    phaddw               m3, m1
+    pmulhrsw             m2, m8
+    pmulhrsw             m3, m8
+    palignr              m4, m3, m2, 4
+    punpcklwd            m1, m2, m4  ; 01 12
+    punpckhwd            m2, m4      ; 23 34
+    pshufd               m0, m3, q2121
+    punpcklwd            m3, m0      ; 45 56
+.hv_w4_loop:
+    pmaddwd              m5, m1, m10 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m11     ; a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, m12     ; a2 b2
+    paddd                m5, m3
+    vpbroadcastq         m4, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m4, m4, m3, 0xcc ; 7 8
+    pshufb               m4, m6
+    pmaddubsw            m4, m7
+    phaddw               m4, m4
+    pmulhrsw             m4, m8
+    palignr              m3, m4, m0, 12
+    mova                 m0, m4
+    punpcklwd            m3, m0      ; 67 78
+    pmaddwd              m4, m3, m13 ; a3 b3
+    paddd                m5, m9
+    paddd                m5, m4
+    psrad                m5, 10
+    vextracti128        xm4, m5, 1
+    packssdw            xm5, xm4
+    packuswb            xm5, xm5
+    pshuflw             xm5, xm5, q3120
+    movd       [dstq+dsq*0], xm5
+    pextrd     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
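+; .hv_w8 walks the block in 8-column strips ((w-8)<<5 packs the strip count
+; above h in r6d) and is register-starved: the previous row pair kept in m0
+; is spilled to the stack (r6m) at the top of every loop iteration.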
+.hv_w8:
+    shr                 mxd, 16
+    sub                srcq, 3
+    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+    vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+    lea                 r6d, [wq-8]
+    mov                  r4, dstq
+    mov                  r7, srcq
+    shl                 r6d, 5
+    mov                 r6b, hb
+.hv_w8_loop0:
+    vbroadcasti128       m7, [subpel_h_shufA]
+    vbroadcasti128       m8, [subpel_h_shufB]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm4,     [srcq+ssq*0]
+    movu                xm5,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    movu                xm6,     [srcq+ssq*0]
+    vbroadcasti128       m0,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vpblendd             m4, m4, m0, 0xf0        ; 0 3
+    vinserti128          m5, m5, [srcq+ssq*0], 1 ; 1 4
+    vinserti128          m6, m6, [srcq+ssq*1], 1 ; 2 5
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m0, m0, [srcq+ssq*0], 1 ; 3 6
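+; HV_H_W8 runs the horizontal 8-tap filter over one register of packed rows:
+; the three shuffles (subpel_h_shufA/B/C) form overlapping byte windows,
+; pmaddubsw multiplies them by the two filter halves (m10 = taps 0-3,
+; m11 = taps 4-7), and paddw/phaddw fold the partial products into one
+; 16-bit sum per output pixel.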
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+    pshufb               %3, %1, %6
+    pshufb               %4, %1, %7
+    pshufb               %1, %5
+    pmaddubsw            %2, %3, m10
+    pmaddubsw            %4, m11
+    pmaddubsw            %3, m11
+    pmaddubsw            %1, m10
+    paddw                %2, %4
+    paddw                %1, %3
+    phaddw               %1, %2
+%endmacro
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
+    vpbroadcastd         m7, [pw_8192]
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m6, m6, q3120
+    pmulhrsw             m0, m7
+    pmulhrsw             m4, m7
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    vpermq               m7, m0, q3120
+    punpcklwd            m1, m4, m5  ; 01
+    punpckhwd            m4, m5      ; 34
+    punpcklwd            m2, m5, m6  ; 12
+    punpckhwd            m5, m6      ; 45
+    punpcklwd            m3, m6, m7  ; 23
+    punpckhwd            m6, m7      ; 56
+.hv_w8_loop:
+    vextracti128        r6m, m0, 1 ; not enough registers
+    movu                xm0,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m0, m0, [srcq+ssq*0], 1 ; 7 8
+    pmaddwd              m8, m1, m12 ; a0
+    pmaddwd              m9, m2, m12 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddwd              m3, m13     ; a1
+    pmaddwd              m4, m13     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddwd              m5, m14     ; a2
+    pmaddwd              m6, m14     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    vbroadcasti128       m5, [subpel_h_shufA]
+    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
+    vpbroadcastd         m5, [pw_8192]
+    vpbroadcastd         m7, [pd_512]
+    vbroadcasti128       m6, r6m
+    pmulhrsw             m0, m5
+    paddd                m8, m7
+    paddd                m9, m7
+    vpermq               m7, m0, q3120    ; 7 8
+    shufpd               m6, m6, m7, 0x04 ; 6 7
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, m15 ; a3
+    paddd                m8, m7
+    pmaddwd              m7, m6, m15 ; b3
+    paddd                m7, m9
+    psrad                m8, 10
+    psrad                m7, 10
+    packssdw             m8, m7
+    vextracti128        xm7, m8, 1
+    packuswb            xm8, xm7
+    pshufd              xm7, xm8, q3120
+    movq       [dstq+dsq*0], xm7
+    movhps     [dstq+dsq*1], xm7
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    movzx                hd, r6b
+    add                  r4, 8
+    add                  r7, 8
+    mov                dstq, r4
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_w8_loop0
+    RET
+
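+; PREP_8TAP_H computes prep's 16-bit intermediates (pixel scale 1<<4 at 8bpc)
+; for a register of packed rows. The AVX-512 path uses VNNI: vpermb gathers
+; the byte windows and vpdpbusd accumulates u8*s8 dot products into dwords,
+; with the accumulators seeded from pd_2 so the following psraw by 2 is a
+; rounding shift. The AVX2 path gets the same result with pshufb, pmaddubsw,
+; phaddw and pmulhrsw by pw_8192; in scalar form both compute
+;   tmp[x] = (sum(src[x+i]*f[i], i=0..7) + 2) >> 2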
+%macro PREP_8TAP_H 0
+ %if cpuflag(avx512)
+    vpermb              m10, m5, m0
+    vpermb              m11, m5, m1
+    vpermb              m12, m6, m0
+    vpermb              m13, m6, m1
+    vpermb              m14, m7, m0
+    vpermb              m15, m7, m1
+    mova                 m0, m4
+    mova                 m2, m4
+    mova                 m1, m4
+    mova                 m3, m4
+    vpdpbusd             m0, m10, m8
+    vpdpbusd             m2, m12, m8
+    vpdpbusd             m1, m11, m8
+    vpdpbusd             m3, m13, m8
+    vpdpbusd             m0, m12, m9
+    vpdpbusd             m2, m14, m9
+    vpdpbusd             m1, m13, m9
+    vpdpbusd             m3, m15, m9
+    packssdw             m0, m2
+    packssdw             m1, m3
+    psraw                m0, 2
+    psraw                m1, 2
+    mova          [tmpq+ 0], m0
+    mova          [tmpq+64], m1
+ %else
+    pshufb               m1, m0, m5
+    pshufb               m2, m0, m6
+    pshufb               m3, m0, m7
+    pmaddubsw            m1, m8
+    pmaddubsw            m0, m2, m8
+    pmaddubsw            m2, m9
+    pmaddubsw            m3, m9
+    paddw                m1, m2
+    paddw                m0, m3
+    phaddw               m0, m1, m0
+    pmulhrsw             m0, m4
+ %endif
+%endmacro
+
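+; PREP_8TAP_V_W4 is the vertical filter for 4-pixel columns: several rows are
+; packed per ymm (one dword each), blends line consecutive rows up next to
+; each other, and deint_shuf4 interleaves them into the row-pair layout that
+; pmaddubsw consumes, producing four output rows per iteration.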
+%macro PREP_8TAP_V_W4 5 ; round reg, weight regs [0-3]
+    movd                xm0, [srcq+strideq*0]
+    vpbroadcastd        ym1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    vpbroadcastd        ym3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd            ym1, ym1, ym0, 0x01 ; 0 2 2 _   2 _ _ _
+    vpblendd            ym3, ym3, ym2, 0x03 ; 1 1 3 3   3 3 _ _
+    vpbroadcastd        ym0, [srcq+strideq*0]
+    vpbroadcastd        ym2, [srcq+strideq*1]
+    vpblendd            ym1, ym1, ym0, 0x68 ; 0 2 2 4   2 4 4 _
+    vpbroadcastd        ym0, [srcq+strideq*2]
+    vbroadcasti128      ym5, [deint_shuf4]
+    vpblendd            ym3, ym3, ym2, 0xc0 ; 1 1 3 3   3 3 5 5
+    vpblendd            ym2, ym3, ym1, 0x55 ; 0 1 2 3   2 3 4 5
+    vpblendd            ym3, ym3, ym1, 0xaa ; 1 2 3 4   3 4 5 _
+    punpcklbw           ym1, ym2, ym3       ; 01  12    23  34
+    vpblendd            ym3, ym3, ym0, 0x80 ; 1 2 3 4   3 4 5 6
+    punpckhbw           ym2, ym3           ; 23  34    45  56
+.v_w4_loop:
+    pinsrd              xm0, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastd        ym3, [srcq+strideq*0]
+    vpbroadcastd        ym4, [srcq+strideq*1]
+    vpblendd            ym3, ym3, ym4, 0x20 ; _ _ 8 _   8 9 _ _
+    vpblendd            ym3, ym3, ym0, 0x03 ; 6 7 8 _   8 9 _ _
+    vpbroadcastd        ym0, [srcq+strideq*2]
+    vpblendd            ym3, ym3, ym0, 0x40 ; 6 7 8 _   8 9 a _
+    pshufb              ym3, ym5           ; 67  78    89  9a
+    pmaddubsw           ym4, ym1, ym%2
+    vperm2i128          ym1, ym2, ym3, 0x21 ; 45  56    67  78
+    pmaddubsw           ym2, ym%3
+    paddw               ym4, ym2
+    mova                ym2, ym3
+    pmaddubsw           ym3, ym%5
+    paddw               ym3, ym4
+    pmaddubsw           ym4, ym1, ym%4
+    paddw               ym3, ym4
+    pmulhrsw            ym3, ym%1
+    mova             [tmpq], ym3
+%endmacro
+
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+    mov                 t0d, FILTER_%2
+    mov                 t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+%macro PREP_8TAP 0
+ %if WIN64
+  DECLARE_REG_TMP 6, 4
+ %else
+  DECLARE_REG_TMP 6, 7
+ %endif
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
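+; The shared entry decodes a packed filter selector: the subpel position is
+; multiplied by 0x010101 and the FILTER_* code added, so that byte 0 indexes
+; the 4-tap filter variant, byte 1 holds the raw position (test against
+; 0xf00 checks whether it is nonzero) and byte 2 indexes the 8-tap variant,
+; as the "8tap_h, mx, 4tap_h" comments note.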
+cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    lea                  r7, [prep%+SUFFIX]
+    movsxd               wq, wm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [r7+wq*2+table_offset(prep,)]
+    add                  wq, r7
+    lea                  r6, [strideq*3]
+%if WIN64
+    pop                  r7
+%endif
+    jmp                  wq
+.h:
+    test                myd, 0xf00
+    jnz .hv
+%if cpuflag(avx512)
+    vpbroadcastd         m4, [pd_2]
+%else
+    vpbroadcastd         m4, [pw_8192]
+    vbroadcasti128       m5, [subpel_h_shufA]
+%endif
+    WIN64_SPILL_XMM      10
+    cmp                  wd, 4
+    je .h_w4
+    tzcnt                wd, wd
+%if notcpuflag(avx512)
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+%endif
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+    add                  wq, r7
+    jmp                  wq
+.h_w4:
+%if cpuflag(avx512)
+    mov                 r3d, 0x4
+    kmovb                k1, r3d
+    vbroadcasti128      ym5, [subpel_h_shufA]
+%endif
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd        ym6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+    lea            stride3q, [strideq*3]
+.h_w4_loop:
+%if cpuflag(avx512icl)
+    mova                ym0, ym4
+    mova                ym1, ym4
+    movq                xm2, [srcq+strideq*0]
+    movq                xm3, [srcq+strideq*1]
+    vpbroadcastq    ym2{k1}, [srcq+strideq*2]
+    vpbroadcastq    ym3{k1}, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pshufb              ym2, ym5
+    pshufb              ym3, ym5
+    vpdpbusd            ym0, ym2, ym6
+    vpdpbusd            ym1, ym3, ym6
+    packssdw            ym0, ym1
+    psraw               ym0, 2
+%else
+    movq                xm0, [srcq+strideq*0]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    movq                xm1, [srcq+strideq*1]
+    vpblendd             m0, m0, m2, 0xf0
+    vpbroadcastq         m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m2, 0xf0
+    pshufb               m0, m5
+    pshufb               m1, m5
+    pmaddubsw            m0, m6
+    pmaddubsw            m1, m6
+    phaddw               m0, m1
+    pmulhrsw             m0, m4
+%endif
+    mova             [tmpq], ym0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
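+; .h_w8 consumes mmsize/16 rows per iteration (2 with AVX2 ymm, 4 with
+; AVX-512 zmm); the mmsize-parameterized lea/sub below encode that.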
+.h_w8:
+%if cpuflag(avx512)
+    vbroadcasti128       m5, [subpel_h_shufA]
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    lea            stride3q, [strideq*3]
+%endif
+.h_w8_loop:
+    movu                xm0, [srcq+strideq*0]
+    vinserti128         ym0, [srcq+strideq*1], 1
+%if cpuflag(avx512)
+    vinserti128          m0, [srcq+strideq*2], 2
+    vinserti128          m0, [srcq+stride3q ], 3
+%endif
+    lea                srcq, [srcq+strideq*(mmsize/(8*2))]
+%if cpuflag(avx512icl)
+    mova                m10, m4
+    mova                m11, m4
+    pshufb               m1, m0, m5
+    pshufb               m2, m0, m6
+    pshufb               m3, m0, m7
+    vpdpbusd            m10, m1, m8
+    vpdpbusd            m11, m2, m8
+    vpdpbusd            m10, m2, m9
+    vpdpbusd            m11, m3, m9
+    packssdw            m10, m11
+    psraw                m0, m10, 2
+%else
+    PREP_8TAP_H
+%endif
+    mova             [tmpq], m0
+    add                tmpq, mmsize
+    sub                  hd, mmsize/(8*2)
+    jg .h_w8_loop
+    RET
+.h_w16:
+%if cpuflag(avx512icl)
+    mova                 m5, [spel_h_perm16a]
+    mova                 m6, [spel_h_perm16b]
+    mova                 m7, [spel_h_perm16c]
+    lea            stride3q, [strideq*3]
+.h_w16_loop:
+    movu                ym0, [srcq+strideq*0]
+    movu                ym1, [srcq+strideq*2]
+    vinserti32x8         m0, [srcq+strideq*1], 1
+    vinserti32x8         m1, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    PREP_8TAP_H
+%else
+.h_w16_loop:
+    movu                xm0, [srcq+strideq*0+8*0]
+    vinserti128          m0, [srcq+strideq*0+8*1], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*0], m0
+    movu                xm0,     [srcq+strideq*1+8*0]
+    vinserti128          m0, m0, [srcq+strideq*1+8*1], 1
+    lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_H
+    mova        [tmpq+32*1], m0
+%endif
+    add                tmpq, mmsize*2
+    sub                  hd, mmsize*2/(16*2)
+    jg .h_w16_loop
+    RET
+.h_w32:
+%if cpuflag(avx512icl)
+    mova                 m5, [spel_h_perm32a]
+    mova                 m6, [spel_h_perm32b]
+    mova                 m7, [spel_h_perm32c]
+.h_w32_loop:
+    movu                 m0, [srcq+strideq*0]
+    movu                 m1, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_H
+    add                tmpq, 64*2
+    sub                  hd, 2
+    jg .h_w32_loop
+    RET
+%else
+    xor                 r6d, r6d
+    jmp .h_start
+%endif
+.h_w64:
+%if cpuflag(avx512)
+    xor                 r6d, r6d
+%else
+    mov                  r6, -32*1
+%endif
+    jmp .h_start
+.h_w128:
+%if cpuflag(avx512)
+    mov                  r6, -64*1
+%else
+    mov                  r6, -32*3
+%endif
+.h_start:
+%if cpuflag(avx512)
+    mova                 m5, [spel_h_perm32a]
+    mova                 m6, [spel_h_perm32b]
+    mova                 m7, [spel_h_perm32c]
+%endif
+    sub                srcq, r6
+    mov                  r5, r6
+.h_loop:
+%if cpuflag(avx512icl)
+    movu                 m0, [srcq+r6+32*0]
+    movu                 m1, [srcq+r6+32*1]
+    PREP_8TAP_H
+%else
+    movu                xm0, [srcq+r6+8*0]
+    vinserti128         ym0, [srcq+r6+8*1], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*0], m0
+    movu                xm0, [srcq+r6+8*2]
+    vinserti128         ym0, [srcq+r6+8*3], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*1], m0
+%endif
+    add                tmpq, mmsize*2
+    add                  r6, mmsize
+    jle .h_loop
+    add                srcq, strideq
+    mov                  r6, r5
+    dec                  hd
+    jg .h_loop
+    RET
+.v:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
+    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
+    shr                 myd, 16  ; Note that the code is 8-tap only, having
+    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
+    cmove               myd, mxd ; had a negligible effect on performance.
+    ; TODO: Would a 6-tap code path be worth it?
+%if cpuflag(avx512)
+    tzcnt                wd, wd
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+    add                  wq, r7
+%endif
+    lea                 myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    vpbroadcastd         m7, [pw_8192]
+    vpbroadcastw         m8, [myq+0]
+    vpbroadcastw         m9, [myq+2]
+    vpbroadcastw        m10, [myq+4]
+    vpbroadcastw        m11, [myq+6]
+%if cpuflag(avx512)
+    jmp                  wq
+%else
+    cmp                  wd, 8
+    jg .v_w16
+    je .v_w8
+%endif
+.v_w4:
+%if cpuflag(avx512)
+    AVX512_MM_PERMUTATION
+    PREP_8TAP_V_W4 23, 24, 25, 26, 27
+    AVX512_MM_PERMUTATION
+%else
+    PREP_8TAP_V_W4 7, 8, 9, 10, 11
+%endif
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+%if cpuflag(avx512)
+    vzeroupper
+%endif
+    RET
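+; The AVX-512 w8 path assembles four row pairs per zmm with masked merging
+; moves (the 0xf044 mask in k1 and its byte-shifted copy in k2), avoiding
+; cross-lane shuffles inside the inner loop.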
+.v_w8:
+%if cpuflag(avx512)
+    mov                 r3d, 0xf044
+    kmovw                k1, r3d
+    kshiftrw             k2, k1, 8
+    movq                xm0, [srcq+strideq*0]
+    vpbroadcastq        ym1, [srcq+strideq*1]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    vpbroadcastq         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m4, [srcq+strideq*0]
+    vpbroadcastq         m5, [srcq+strideq*1]
+    vpbroadcastq         m6, [srcq+strideq*2]
+    vmovdqa64       ym0{k1}, ym1
+    vmovdqa64       ym1{k1}, ym2
+    vmovdqa64        m2{k1}, m3
+    vmovdqa64        m3{k1}, m4
+    vmovdqa64        m4{k1}, m5
+    vmovdqa64        m5{k1}, m6
+    punpcklbw           ym0, ym1 ; 01 12 __ __
+    punpcklbw            m2, m3  ; 23 34 23 34
+    punpcklbw            m4, m5  ; 45 56 45 56
+    vmovdqa64        m0{k2}, m2  ; 01 12 23 34
+    vmovdqa64        m2{k2}, m4  ; 23 34 45 56
+.v_w8_loop:
+    vpbroadcastq         m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m5, [srcq+strideq*1]
+    pmaddubsw           m14, m0, m8
+    pmaddubsw           m15, m2, m9
+    vpblendmq        m0{k1}, m6, m1
+    vpblendmq        m2{k1}, m1, m3
+    vpbroadcastq         m6, [srcq+strideq*2]
+    paddw               m14, m15
+    punpcklbw            m2, m0, m2   ; 67 78 67 78
+    vpblendmq       m12{k1}, m3, m5
+    vpblendmq       m13{k1}, m5, m6
+    vpblendmq        m0{k2}, m4, m2   ; 45 56 67 78
+    punpcklbw            m4, m12, m13 ; 89 9a 89 9a
+    vmovdqa64        m2{k2}, m4       ; 67 78 89 9a
+    pmaddubsw           m12, m0, m10
+    pmaddubsw           m13, m2, m11
+    paddw               m14, m12
+    paddw               m14, m13
+    pmulhrsw            m14, m7
+    mova             [tmpq], m14
+%else
+    movq                xm1, [srcq+strideq*0]
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    vpbroadcastq         m5, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m6, [srcq+strideq*1]
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpblendd             m1, m1, m4, 0x30
+    vpblendd             m4, m4, m2, 0x30
+    punpcklbw            m1, m4 ; 01 12
+    vpblendd             m2, m2, m5, 0x30
+    vpblendd             m5, m5, m3, 0x30
+    punpcklbw            m2, m5 ; 23 34
+    vpblendd             m3, m3, m6, 0x30
+    vpblendd             m6, m6, m0, 0x30
+    punpcklbw            m3, m6 ; 45 56
+.v_w8_loop:
+    vpbroadcastq         m4, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pmaddubsw            m5, m2, m9  ; a1
+    pmaddubsw            m6, m2, m8  ; b0
+    vpblendd             m2, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+strideq*0]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m2, m4      ; 67 78
+    pmaddubsw            m1, m8      ; a0
+    pmaddubsw            m4, m3, m9  ; b1
+    paddw                m5, m1
+    mova                 m1, m3
+    pmaddubsw            m3, m10     ; a2
+    paddw                m6, m4
+    paddw                m5, m3
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpblendd             m3, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m3, m4      ; 89 9a
+    pmaddubsw            m4, m2, m11 ; a3
+    paddw                m5, m4
+    pmaddubsw            m4, m2, m10 ; b2
+    paddw                m6, m4
+    pmaddubsw            m4, m3, m11 ; b3
+    paddw                m6, m4
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    mova        [tmpq+32*0], m5
+    mova        [tmpq+32*1], m6
+%endif
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+%if cpuflag(avx512)
+    mov                 r3d, 0xf0
+    kmovb                k1, r3d
+    vbroadcasti128       m0, [srcq+strideq*0]
+    vbroadcasti128       m1, [srcq+strideq*1]
+    vbroadcasti128       m2, [srcq+strideq*2]
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vbroadcasti128       m4, [srcq+strideq*0]
+    vbroadcasti128       m5, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*2]
+    vmovdqa64        m0{k1}, m1
+    vmovdqa64        m1{k1}, m2
+    vmovdqa64        m2{k1}, m3
+    vmovdqa64        m3{k1}, m4
+    vmovdqa64        m4{k1}, m5
+    vmovdqa64        m5{k1}, m6
+    shufpd               m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+    shufpd               m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+    shufpd               m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+    shufpd               m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+    punpckhbw            m2, m0, m1   ;  23a   23b   34a   34b
+    punpcklbw            m0, m1       ;  01a   01b   12a   12b
+    punpcklbw            m4, m5       ;  45a   45b   56a   56b
+.v_w16_loop:
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vbroadcasti128       m5, [srcq+strideq*0]
+    vpblendmq        m1{k1}, m6, m3
+    vmovdqa64        m3{k1}, m5
+    pmaddubsw           m12, m0, m8
+    pmaddubsw           m13, m2, m8
+    pmaddubsw           m14, m2, m9
+    pmaddubsw           m15, m4, m9
+    pmaddubsw            m0, m4, m10
+    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*2]
+    paddw               m12, m14
+    paddw               m13, m15
+    paddw               m12, m0
+    vmovdqa64        m5{k1}, m2
+    vmovdqa64        m2{k1}, m6
+    mova                 m0, m4
+    shufpd               m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+    shufpd               m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+    punpcklbw            m2, m1, m3   ;  67a   67b   78a   78b
+    punpckhbw            m4, m1, m3   ;  89a   89b   9Aa   9Ab
+    pmaddubsw           m14, m2, m10
+    pmaddubsw           m15, m2, m11
+    paddw               m13, m14
+    paddw               m12, m15
+    pmaddubsw           m14, m4, m11
+    paddw               m13, m14
+    pmulhrsw            m12, m7
+    pmulhrsw            m13, m7
+    mova          [tmpq+ 0], m12
+    mova          [tmpq+64], m13
+    add                tmpq, 64*2
+    sub                  hd, 4
+    jg .v_w16_loop
+%else
+    lea                 r6d, [wq-16]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+    shl                 r6d, 4
+    mov                 r6b, hb
+.v_w16_loop0:
+    vbroadcasti128       m4, [srcq+strideq*0]
+    vbroadcasti128       m5, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m0, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*0]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m1, [srcq+strideq*0]
+    vbroadcasti128       m2, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m3, [srcq+strideq*0]
+    shufpd               m4, m4, m0, 0x0c
+    shufpd               m5, m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w16_loop:
+    vbroadcasti128      m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128      m13, [srcq+strideq*0]
+    pmaddubsw           m14, m1, m8  ; a0
+    pmaddubsw           m15, m2, m8  ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, m9      ; a1
+    pmaddubsw            m4, m9      ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, m10     ; a2
+    pmaddubsw            m6, m10     ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, m11 ; a3
+    pmaddubsw           m13, m6, m11 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    mova        [tmpq+wq*0], m14
+    mova        [tmpq+wq*2], m15
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w16_loop
+    movzx                hd, r6b
+    add                  r5, 32
+    add                  r7, 16
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_w16_loop0
+%endif
+    RET
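+; With AVX-512, w32 is handled natively: bilin_v_perm64 spreads a 32-byte row
+; across the zmm so a plain punpcklbw forms all 32 column pairs. w64 and
+; w128 instead take the 64-column loop at .v_start, with w128 split into two
+; tiles via the packed counter in r6d.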
+%if cpuflag(avx512)
+.v_w32:
+    mova                m18, [bilin_v_perm64]
+    movu                ym0, [srcq+strideq*0]
+    movu                ym1, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym2, [srcq+strideq*0]
+    movu                ym3, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym4, [srcq+strideq*0]
+    movu                ym5, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym6, [srcq+strideq*0]
+    vpermq               m0, m18, m0
+    vpermq               m1, m18, m1
+    vpermq               m2, m18, m2
+    vpermq               m3, m18, m3
+    vpermq               m4, m18, m4
+    vpermq               m5, m18, m5
+    vpermq               m6, m18, m6
+    punpcklbw            m0, m1
+    punpcklbw            m1, m2
+    punpcklbw            m2, m3
+    punpcklbw            m3, m4
+    punpcklbw            m4, m5
+    punpcklbw            m5, m6
+.v_w32_loop:
+    movu               ym12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu               ym13, [srcq+strideq*0]
+    pmaddubsw           m14, m0, m8
+    pmaddubsw           m16, m2, m9
+    pmaddubsw           m15, m1, m8
+    pmaddubsw           m17, m3, m9
+    mova                 m0, m2
+    mova                 m1, m3
+    vpermq              m12, m18, m12
+    vpermq              m13, m18, m13
+    paddw               m14, m16
+    paddw               m15, m17
+    pmaddubsw           m16, m4, m10
+    pmaddubsw           m17, m5, m10
+    punpcklbw            m6, m12
+    punpcklbw           m12, m13
+    mova                 m2, m4
+    mova                 m3, m5
+    paddw               m14, m16
+    paddw               m15, m17
+    pmaddubsw           m16, m6, m11
+    pmaddubsw           m17, m12, m11
+    mova                 m4, m6
+    mova                 m5, m12
+    paddw               m14, m16
+    paddw               m15, m17
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    mova                 m6, m13
+    mova          [tmpq+ 0], m14
+    mova          [tmpq+64], m15
+    add                tmpq, 64*2
+    sub                  hd, 2
+    jg .v_w32_loop
+    vzeroupper
+    RET
+.v_w64:
+    mov                 r6d, hd
+    mov                  wd, 64
+    jmp .v_start
+.v_w128:
+    lea                 r6d, [(1<<8)+hq]
+    mov                  wd, 128
+.v_start:
+    WIN64_SPILL_XMM      27
+    mova                m26, [bilin_v_perm64]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+.v_loop0:
+    vpermq               m0, m26, [srcq+strideq*0]
+    vpermq               m1, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m2, m26, [srcq+strideq*0]
+    vpermq               m3, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m4, m26, [srcq+strideq*0]
+    vpermq               m5, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m6, m26, [srcq+strideq*0]
+    punpckhbw           m12, m0, m1
+    punpcklbw            m0, m1
+    punpckhbw           m13, m1, m2
+    punpcklbw            m1, m2
+    punpckhbw           m14, m2, m3
+    punpcklbw            m2, m3
+    punpckhbw           m15, m3, m4
+    punpcklbw            m3, m4
+    punpckhbw           m16, m4, m5
+    punpcklbw            m4, m5
+    punpckhbw           m17, m5, m6
+    punpcklbw            m5, m6
+.v_loop:
+    vpermq              m18, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq              m19, m26, [srcq+strideq*0]
+    pmaddubsw           m20, m0, m8
+    pmaddubsw           m21, m12, m8
+    pmaddubsw           m22, m1, m8
+    pmaddubsw           m23, m13, m8
+    mova                 m0, m2
+    mova                m12, m14
+    mova                 m1, m3
+    mova                m13, m15
+    pmaddubsw            m2, m9
+    pmaddubsw           m14, m9
+    pmaddubsw            m3, m9
+    pmaddubsw           m15, m9
+    punpckhbw           m24, m6, m18
+    punpcklbw            m6, m18
+    paddw               m20, m2
+    paddw               m21, m14
+    paddw               m22, m3
+    paddw               m23, m15
+    mova                 m2, m4
+    mova                m14, m16
+    mova                 m3, m5
+    mova                m15, m17
+    pmaddubsw            m4, m10
+    pmaddubsw           m16, m10
+    pmaddubsw            m5, m10
+    pmaddubsw           m17, m10
+    punpckhbw           m25, m18, m19
+    punpcklbw           m18, m19
+    paddw               m20, m4
+    paddw               m21, m16
+    paddw               m22, m5
+    paddw               m23, m17
+    mova                 m4, m6
+    mova                m16, m24
+    mova                 m5, m18
+    mova                m17, m25
+    pmaddubsw            m6, m11
+    pmaddubsw           m24, m11
+    pmaddubsw           m18, m11
+    pmaddubsw           m25, m11
+    paddw               m20, m6
+    paddw               m21, m24
+    paddw               m22, m18
+    paddw               m23, m25
+    pmulhrsw            m20, m7
+    pmulhrsw            m21, m7
+    pmulhrsw            m22, m7
+    pmulhrsw            m23, m7
+    mova                 m6, m19
+    mova     [tmpq+wq*0+ 0], m20
+    mova     [tmpq+wq*0+64], m21
+    mova     [tmpq+wq*2+ 0], m22
+    mova     [tmpq+wq*2+64], m23
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_loop
+    movzx                hd, r6b
+    add                  r5, 64*2
+    add                  r7, 64
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_loop0
+%endif
+    RET
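+; prep's hv path mirrors put's, but the final rounding is (v+32)>>6 (pd_32
+; instead of pd_512), preserving the 1<<4 intermediate scale that the
+; compound averaging kernels expect.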
+.hv:
+    %assign stack_offset stack_offset - stack_size_padded
+    %assign stack_size_padded 0
+    WIN64_SPILL_XMM      16
+    cmp                  wd, 4
+    je .hv_w4
+    shr                 mxd, 16
+    sub                srcq, 3
+    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+%if cpuflag(avx512)
+    tzcnt                wd, wd
+    vpbroadcastd         m8, [pd_2]
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+    vpbroadcastd         m9, [pd_32]
+    add                  wq, r7
+%endif
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+%if cpuflag(avx512)
+    jmp                  wq
+%else
+    jmp .hv_w8
+%endif
+.hv_w4:
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+%if cpuflag(avx512)
+    mov                 r3d, 0x04
+    kmovb                k1, r3d
+    kshiftlb             k2, k1, 2
+    kshiftlb             k3, k1, 4
+    vpbroadcastd        m10, [pd_2]
+    vbroadcasti128      m16, [subpel_h_shufA]
+%else
+    mova                 m7, [subpel_h_shuf4]
+    pmovzxbd             m9, [deint_shuf4]
+    vpbroadcastd        m10, [pw_8192]
+%endif
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    vpbroadcastd        m11, [pd_32]
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+%if cpuflag(avx512icl)
+    movq                xm3, [srcq+strideq*0]
+    vpbroadcastq        ym2, [srcq+strideq*1]
+    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
+    vpbroadcastq     m2{k2}, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq     m3{k2}, [srcq+strideq*0]
+    vpbroadcastq     m2{k3}, [srcq+strideq*1]
+    vpbroadcastq     m3{k3}, [srcq+strideq*2]
+    mova                m17, [spel_hv_perm4a]
+    movu                m18, [spel_hv_perm4b]
+    mova                 m0, m10
+    mova                 m1, m10
+    pshufb               m2, m16
+    pshufb               m3, m16
+    vpdpbusd             m0, m2, m8
+    vpdpbusd             m1, m3, m8
+    packssdw             m0, m1        ; _ 0  1 2  3 4  5 6
+    psraw                m0, 2
+    vpermb               m1, m17, m0   ; 01 12 23 34
+    vpermb               m2, m18, m0   ; 23 34 45 56
+.hv_w4_loop:
+    movq                xm3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movq                xm4, [srcq+strideq*0]
+    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
+    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
+    mova                ym5, ym10
+    mova                ym6, ym10
+    pshufb              ym3, ym16
+    pshufb              ym4, ym16
+    vpdpbusd            ym5, ym3, ym8
+    vpdpbusd            ym6, ym4, ym8
+    mova                 m7, m11
+    packssdw            ym5, ym6       ; 7 8  9 a  _ _  _ _
+    psraw               ym5, 2
+    valignq              m0, m5, m0, 4 ; _ 4  5 6  7 8  9 a
+    vpdpwssd             m7, m1, m12
+    vpdpwssd             m7, m2, m13
+    vpermb               m1, m17, m0   ; 45 56 67 78
+    vpermb               m2, m18, m0   ; 67 78 89 9a
+    vpdpwssd             m7, m1, m14
+    vpdpwssd             m7, m2, m15
+    psrad                m7, 6
+    vpmovdw          [tmpq], m7
+%else
+    vpbroadcastq         m2, [srcq+strideq*0]
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpbroadcastq         m5, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m6, [srcq+strideq*1]
+    vpbroadcastq         m1, [srcq+strideq*2]
+    vpblendd             m2, m2, m4, 0xcc ; 0 1
+    vpblendd             m0, m0, m5, 0xcc ; 2 3
+    vpblendd             m3, m3, m6, 0xcc ; 4 5
+    pshufb               m2, m7 ; 00 01 10 11  02 03 12 13
+    pshufb               m0, m7 ; 20 21 30 31  22 23 32 33
+    pshufb               m3, m7 ; 40 41 50 51  42 43 52 53
+    pshufb               m1, m7 ; 60 61 60 61  62 63 62 63
+    pmaddubsw            m2, m8
+    pmaddubsw            m0, m8
+    pmaddubsw            m3, m8
+    pmaddubsw            m1, m8
+    phaddw               m2, m0 ; 0a 1a 2a 3a  0b 1b 2b 3b
+    phaddw               m3, m1 ; 4a 5a 6a __  4b 5b 6b __
+    pmulhrsw             m2, m10
+    pmulhrsw             m3, m10
+    palignr              m4, m3, m2, 4 ; 1a 2a 3a 4a  1b 2b 3b 4b
+    punpcklwd            m1, m2, m4  ; 01 12
+    punpckhwd            m2, m4      ; 23 34
+    pshufd               m0, m3, q2121
+    punpcklwd            m3, m0      ; 45 56
+.hv_w4_loop:
+    pmaddwd              m5, m1, m12 ; a0 b0
+    pmaddwd              m6, m2, m12 ; c0 d0
+    pmaddwd              m2, m13     ; a1 b1
+    pmaddwd              m4, m3, m13 ; c1 d1
+    mova                 m1, m3
+    pmaddwd              m3, m14     ; a2 b2
+    paddd                m5, m2
+    vpbroadcastq         m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    paddd                m6, m4
+    paddd                m5, m3
+    vpbroadcastq         m4, [srcq+strideq*0]
+    vpbroadcastq         m3, [srcq+strideq*1]
+    vpblendd             m2, m2, m4, 0xcc
+    vpbroadcastq         m4, [srcq+strideq*2]
+    vpblendd             m3, m3, m4, 0xcc
+    pshufb               m2, m7
+    pshufb               m3, m7
+    pmaddubsw            m2, m8
+    pmaddubsw            m3, m8
+    phaddw               m2, m3
+    pmulhrsw             m2, m10
+    palignr              m3, m2, m0, 12
+    mova                 m0, m2
+    punpcklwd            m2, m3, m0  ; 67 78
+    punpckhwd            m3, m0      ; 89 9a
+    pmaddwd              m4, m2, m14 ; c2 d2
+    paddd                m6, m11
+    paddd                m5, m11
+    paddd                m6, m4
+    pmaddwd              m4, m2, m15 ; a3 b3
+    paddd                m5, m4
+    pmaddwd              m4, m3, m15 ; c3 d3
+    paddd                m6, m4
+    psrad                m5, 6
+    psrad                m6, 6
+    packssdw             m5, m6
+    vpermd               m5, m9, m5
+    mova             [tmpq], m5
+%endif
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+%if cpuflag(avx512)
+    vzeroupper
+%endif
+    RET
+.hv_w8:
+%if cpuflag(avx512icl)
+    WIN64_SPILL_XMM      24
+    vbroadcasti128      m16, [subpel_h_shufA]
+    vbroadcasti128      m17, [subpel_h_shufB]
+    vbroadcasti128      m18, [subpel_h_shufC]
+    vinserti128         ym0, [srcq+strideq*0], 1
+    vinserti128          m0, [srcq+strideq*1], 2
+    vinserti128          m0, [srcq+strideq*2], 3
+    movu                xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128         ym1, [srcq+strideq*0], 1
+    vinserti128          m1, [srcq+strideq*1], 2
+    vinserti128          m1, [srcq+strideq*2], 3
+    mova                 m2, m8
+    mova                 m4, m8
+    mova                 m3, m8
+    mova                 m5, m8
+    pshufb              m20, m0, m16
+    pshufb              m21, m0, m17
+    pshufb              m22, m0, m18
+    pshufb              m23, m1, m16
+    pshufb               m6, m1, m17
+    pshufb               m7, m1, m18
+    vpdpbusd             m2, m20, m10
+    vpdpbusd             m4, m21, m10
+    vpdpbusd             m2, m21, m11
+    vpdpbusd             m4, m22, m11
+    vpdpbusd             m3, m23, m10
+    vpdpbusd             m5,  m6, m10
+    vpdpbusd             m3,  m6, m11
+    vpdpbusd             m5,  m7, m11
+    packssdw             m2, m4
+    packssdw             m3, m5
+    psraw                m2, 2          ; _ 0 1 2
+    psraw                m3, 2          ; 3 4 5 6
+    valignq              m0, m3, m2, 2  ; 0 1 2 3
+    valignq              m1, m3, m2, 4  ; 1 2 3 4
+    valignq              m2, m3, m2, 6  ; 2 3 4 5
+    punpcklwd            m4, m0, m1 ; 01a 12a 23a 34a
+    punpckhwd            m5, m0, m1 ; 01b 12b 23b 34b
+    punpcklwd            m6, m2, m3 ; 23a 34a 45a 56a
+    punpckhwd            m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+    movu               xm19, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128        ym19, [srcq+strideq*0], 1
+    vinserti128         m19, [srcq+strideq*1], 2
+    vinserti128         m19, [srcq+strideq*2], 3
+    mova                m20, m9
+    mova                m21, m9
+    mova                m22, m8
+    mova                m23, m8
+    vpdpwssd            m20, m4, m12
+    vpdpwssd            m21, m5, m12
+    vpdpwssd            m20, m6, m13
+    vpdpwssd            m21, m7, m13
+    pshufb               m0, m19, m16
+    pshufb               m1, m19, m17
+    pshufb               m2, m19, m18
+    vpdpbusd            m22, m0, m10
+    vpdpbusd            m23, m1, m10
+    vpdpbusd            m22, m1, m11
+    vpdpbusd            m23, m2, m11
+    packssdw            m22, m23
+    psraw               m22, 2          ; 7 8 9 A
+    valignq              m0, m22, m3, 2 ; 4 5 6 7
+    valignq              m1, m22, m3, 4 ; 5 6 7 8
+    valignq              m2, m22, m3, 6 ; 6 7 8 9
+    mova                 m3, m22
+    punpcklwd            m4, m0, m1 ; 45a 56a 67a 78a
+    punpckhwd            m5, m0, m1 ; 45b 56b 67b 78b
+    punpcklwd            m6, m2, m3 ; 67a 78a 89a 9Aa
+    punpckhwd            m7, m2, m3 ; 67b 78b 89b 9Ab
+    vpdpwssd            m20, m4, m14
+    vpdpwssd            m21, m5, m14
+    vpdpwssd            m20, m6, m15
+    vpdpwssd            m21, m7, m15
+    psrad               m20, 6
+    psrad               m21, 6
+    packssdw            m20, m21
+    mova             [tmpq], m20
+    add                tmpq, 64
+    sub                  hd, 4
+    jg .hv_w8_loop
+%else
+    lea                 r6d, [wq-8]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+    shl                 r6d, 5
+    mov                 r6b, hb
+.hv_w8_loop0:
+    vbroadcasti128       m7, [subpel_h_shufA]
+    vbroadcasti128       m8, [subpel_h_shufB]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm4,     [srcq+strideq*0]
+    movu                xm5,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm6,     [srcq+strideq*0]
+    vbroadcasti128       m0,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    vpblendd             m4, m4, m0, 0xf0            ; 0 3
+    vinserti128          m5, m5, [srcq+strideq*0], 1 ; 1 4
+    vinserti128          m6, m6, [srcq+strideq*1], 1 ; 2 5
+    lea                srcq,     [srcq+strideq*2]
+    vinserti128          m0, m0, [srcq+strideq*0], 1 ; 3 6
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
+    vpbroadcastd         m7, [pw_8192]
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m6, m6, q3120
+    pmulhrsw             m0, m7
+    pmulhrsw             m4, m7
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    vpermq               m7, m0, q3120
+    punpcklwd            m1, m4, m5  ; 01
+    punpckhwd            m4, m5      ; 34
+    punpcklwd            m2, m5, m6  ; 12
+    punpckhwd            m5, m6      ; 45
+    punpcklwd            m3, m6, m7  ; 23
+    punpckhwd            m6, m7      ; 56
+.hv_w8_loop:
+    vextracti128     [tmpq], m0, 1 ; not enough registers
+    movu                xm0,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    vinserti128          m0, m0, [srcq+strideq*0], 1 ; 7 8
+    pmaddwd              m8, m1, m12 ; a0
+    pmaddwd              m9, m2, m12 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddwd              m3, m13     ; a1
+    pmaddwd              m4, m13     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddwd              m5, m14     ; a2
+    pmaddwd              m6, m14     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    vbroadcasti128       m5, [subpel_h_shufA]
+    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
+    vpbroadcastd         m5, [pw_8192]
+    vpbroadcastd         m7, [pd_32]
+    vbroadcasti128       m6, [tmpq]
+    pmulhrsw             m0, m5
+    paddd                m8, m7
+    paddd                m9, m7
+    vpermq               m7, m0, q3120    ; 7 8
+    shufpd               m6, m6, m7, 0x04 ; 6 7
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, m15 ; a3
+    paddd                m8, m7
+    pmaddwd              m7, m6, m15 ; b3
+    paddd                m7, m9
+    psrad                m8, 6
+    psrad                m7, 6
+    packssdw             m8, m7
+    vpermq               m7, m8, q3120
+    mova         [tmpq+wq*0], xm7
+    vextracti128 [tmpq+wq*2], m7, 1
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    movzx                hd, r6b
+    add                  r5, 16
+    add                  r7, 8
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_w8_loop0
+%endif
+    RET
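+; For w >= 16 the AVX-512 hv path works in 16-column strips. wd is loaded
+; with w*2, the byte stride of one 16-bit output row, so tmpq+wq*0/1 address
+; consecutive rows; r6d again packs the strip count above the row count h
+; in its low byte.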
+%if cpuflag(avx512icl)
+.hv_w16:
+    mov                  wd, 16*2
+    jmp .hv_start
+.hv_w32:
+    mov                  wd, 32*2
+    jmp .hv_start
+.hv_w64:
+    mov                  wd, 64*2
+    jmp .hv_start
+.hv_w128:
+    mov                  wd, 128*2
+.hv_start:
+    WIN64_SPILL_XMM      31
+    mova                m16, [spel_h_perm16a]
+    mova                m17, [spel_h_perm16b]
+    mova                m18, [spel_h_perm16c]
+    lea                 r6d, [wq*8-16*2*8+hq]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+.hv_loop0:
+    movu                ym0, [srcq+strideq*0]
+    vinserti32x8         m0, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym1, [srcq+strideq*0]
+    vinserti32x8         m1, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym2, [srcq+strideq*0]
+    vinserti32x8         m2, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym3, [srcq+strideq*0]
+    mova                 m4, m8
+    mova                 m5, m8
+    mova                 m6, m8
+    mova                 m7, m8
+    vpermb              m19, m16, m0
+    vpermb              m20, m17, m0
+    vpermb              m21, m18, m0
+    vpermb              m22, m16, m1
+    vpermb              m23, m17, m1
+    vpermb              m24, m18, m1
+    vpermb              m25, m16, m2
+    vpermb              m26, m17, m2
+    vpermb              m27, m18, m2
+    vpermb             ym28, ym16, ym3
+    vpermb             ym29, ym17, ym3
+    vpermb             ym30, ym18, ym3
+    mova                 m0, m8
+    mova                 m1, m8
+    mova                ym2, ym8
+    mova                ym3, ym8
+    vpdpbusd             m4, m19, m10
+    vpdpbusd             m5, m20, m10
+    vpdpbusd             m6, m22, m10
+    vpdpbusd             m7, m23, m10
+    vpdpbusd             m0, m25, m10
+    vpdpbusd             m1, m26, m10
+    vpdpbusd            ym2, ym28, ym10
+    vpdpbusd            ym3, ym29, ym10
+    vpdpbusd             m4, m20, m11
+    vpdpbusd             m5, m21, m11
+    vpdpbusd             m6, m23, m11
+    vpdpbusd             m7, m24, m11
+    vpdpbusd             m0, m26, m11
+    vpdpbusd             m1, m27, m11
+    vpdpbusd            ym2, ym29, ym11
+    vpdpbusd            ym3, ym30, ym11
+    packssdw             m4, m5
+    packssdw             m6, m7
+    packssdw             m0, m1
+    packssdw            ym2, ym3
+    psraw                m4, 2             ; 0a 0b 1a 1b
+    psraw                m6, 2             ; 2a 2b 3a 3b
+    psraw                m0, 2             ; 4a 4b 5a 5b
+    psraw               ym2, 2             ; 6a 6b __ __
+    vshufi32x4           m5, m4, m6, q1032 ; 1a 1b 2a 2b
+    vshufi32x4           m7, m6, m0, q1032 ; 3a 3b 4a 4b
+    vshufi32x4           m1, m0, m2, q1032 ; 5a 5b 6a 6b
+    punpcklwd            m2, m4, m5 ; 01a 01c 12a 12c
+    punpckhwd            m3, m4, m5 ; 01b 01d 12b 12d
+    punpcklwd            m4, m6, m7 ; 23a 23c 34a 34c
+    punpckhwd            m5, m6, m7 ; 23b 23d 34b 34d
+    punpcklwd            m6, m0, m1 ; 45a 45c 56a 56c
+    punpckhwd            m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+    movu               ym19, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vinserti32x8        m19, [srcq+strideq*0], 1
+    mova                m20, m9
+    mova                m21, m9
+    mova                m22, m8
+    mova                m23, m8
+    vpdpwssd            m20, m2, m12
+    vpdpwssd            m21, m3, m12
+    vpdpwssd            m20, m4, m13
+    vpdpwssd            m21, m5, m13
+    vpermb              m24, m16, m19
+    vpermb              m25, m17, m19
+    vpermb              m26, m18, m19
+    vpdpbusd            m22, m24, m10
+    vpdpbusd            m23, m25, m10
+    vpdpbusd            m22, m25, m11
+    vpdpbusd            m23, m26, m11
+    packssdw            m22, m23
+    psraw               m22, 2              ; 7a 7b 8a 8b
+    vshufi32x4           m0, m1, m22, q1032 ; 6a 6b 7a 7b
+    mova                 m2, m4
+    mova                 m3, m5
+    mova                 m1, m22
+    mova                 m4, m6
+    mova                 m5, m7
+    punpcklwd            m6, m0, m1 ; 67a 67c 78a 78c
+    punpckhwd            m7, m0, m1 ; 67b 67d 78b 78d
+    vpdpwssd            m20, m4, m14
+    vpdpwssd            m21, m5, m14
+    vpdpwssd            m20, m6, m15
+    vpdpwssd            m21, m7, m15
+    psrad               m20, 6
+    psrad               m21, 6
+    packssdw            m20, m21
+    mova          [tmpq+wq*0], ym20
+    vextracti32x8 [tmpq+wq*1], m20, 1
+    lea                tmpq, [tmpq+wq*2]
+    sub                  hd, 2
+    jg .hv_loop
+    movzx                hd, r6b
+    add                  r5, 32
+    add                  r7, 16
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_loop0
+%endif
+    RET
+%endmacro
+
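+; The scaled-MC code shares one body between put and prep. prep_8tap_scaled
+; takes one pointer argument fewer, so REMAP_REG shifts every register name
+; down by one at assembly time; the ..._TO_PREV/..._TO_DEFAULT pair switches
+; the mapping around RETs, and movifprep emits its mov only in prep builds.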
+%macro movifprep 2
+ %if isprep
+    mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1  r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+  %xdefine r14_save r14
+  %assign %%i 14
+  %rep 14
+   %assign %%j %%i-1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i-1
+  %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+  %assign %%i 1
+  %rep 13
+   %assign %%j %%i+1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i+1
+  %endrep
+  %xdefine r14 r14_save
+  %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+    RET
+ %if %1
+    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
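+; Horizontal pass for two source rows: gather 8-byte strips from the
+; per-column offsets (r4, r6, r7, r9, r10, r11, r13, rX), apply the
+; per-column 8-tap filters held in m15/m10, and round the packed result
+; into m%1 via pmulhrsw with m12.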
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+    movq               xm%1, [srcq+ r4]
+    movq               xm%2, [srcq+ r6]
+    movhps             xm%1, [srcq+ r7]
+    movhps             xm%2, [srcq+ r9]
+    vinserti128         m%1, [srcq+r10], 1
+    vinserti128         m%2, [srcq+r11], 1
+    vpbroadcastq        m%5, [srcq+r13]
+    vpbroadcastq        m%6, [srcq+ rX]
+    add                srcq, ssq
+    movq               xm%3, [srcq+ r4]
+    movq               xm%4, [srcq+ r6]
+    movhps             xm%3, [srcq+ r7]
+    movhps             xm%4, [srcq+ r9]
+    vinserti128         m%3, [srcq+r10], 1
+    vinserti128         m%4, [srcq+r11], 1
+    vpbroadcastq        m%7, [srcq+r13]
+    vpbroadcastq        m%8, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd            m%1, m%5, 0xc0
+    vpblendd            m%2, m%6, 0xc0
+    vpblendd            m%3, m%7, 0xc0
+    vpblendd            m%4, m%8, 0xc0
+    pmaddubsw           m%1, m15
+    pmaddubsw           m%2, m10
+    pmaddubsw           m%3, m15
+    pmaddubsw           m%4, m10
+    phaddw              m%1, m%2
+    phaddw              m%3, m%4
+    phaddw              m%1, m%3
+    pmulhrsw            m%1, m12
+%endmacro
+
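+; Instantiates put_8tap_scaled (%1 == put) or prep_8tap_scaled (%1 == prep);
+; both variants share the body below via isprep and the register remapping
+; macros above.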
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+    lea            base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
+    tzcnt                wd, wm
+    vpbroadcastd         m8, dxm
+%if isprep && UNIX64
+    movd               xm14, mxd
+    vpbroadcastd        m14, xm14
+    mov                 r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+    vpbroadcastd        m14, mxm
+%endif
+    mov                 dyd, dym
+%ifidn %1, put
+ %if WIN64
+    mov                 r8d, hm
+  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r5m
+  %define dxm r8m
+ %else
+  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+  %define dsm [rsp+96]
+  %define rX r1
+  %define rXd r1d
+ %else
+  %define dsm dsq
+  %define rX r14
+  %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+    mov                 r7d, hm
+  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r4m
+  %define dxm r7m
+ %else
+  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    vpbroadcastd        m12, [base+pw_8192]
+%ifidn %1, put
+    vpbroadcastd        m13, [base+pd_512]
+%else
+    vpbroadcastd        m13, [base+pd_32]
+%endif
+    pxor                 m9, m9
+    lea                ss3q, [ssq*3]
+    movzx               r7d, t1b
+    shr                 t1d, 16
+    cmp                  hd, 6
+    cmovs               t1d, r7d
+    sub                srcq, ss3q
+    cmp                 dyd, 1024
+    je .dy1
+    cmp                 dyd, 2048
+    je .dy2
+    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
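+; .w2 (and .w4 below) load only the middle four taps of each 8-tap filter
+; (dword load at byte offset +2 into subpel_filters), i.e. a 4-tap
+; horizontal filter for narrow blocks.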
+.w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0]
+    movq                xm1, [srcq+ssq*2]
+    movhps              xm0, [srcq+ssq*1]
+    movhps              xm1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m0, [srcq+ssq*0], 1
+    vinserti128          m1, [srcq+ssq*2], 1
+    vpbroadcastq         m2, [srcq+ssq*1]
+    vpbroadcastq         m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpblendd            m15, m7, 0xaa
+    vpblendd             m0, m2, 0xc0       ; 0 1  4 5
+    vpblendd             m1, m3, 0xc0       ; 2 3  6 7
+    pblendvb            m15, m11, m8
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    phaddw               m0, m1
+    pmulhrsw             m0, m12            ; 0 1 2 3  4 5 6 7
+    vextracti128        xm1, m0, 1          ; 4 5 6 7
+    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
+    punpcklwd           xm3, xm0, xm2       ; 01 12
+    punpckhwd           xm0, xm2            ; 23 34
+    pshufd              xm4, xm1, q0321     ; 5 6 7 _
+    punpcklwd           xm2, xm1, xm4       ; 45 56
+    punpckhwd           xm4, xm1, xm4       ; 67 __
+.w2_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    pshufd              xm8, xm11, q0000
+    pshufd              xm9, xm11, q1111
+    pshufd             xm10, xm11, q2222
+    pshufd             xm11, xm11, q3333
+    pmaddwd             xm5, xm3, xm8
+    pmaddwd             xm6, xm0, xm9
+    pmaddwd             xm7, xm2, xm10
+    pmaddwd             xm8, xm4, xm11
+    paddd               xm5, xm6
+    paddd               xm7, xm8
+    paddd               xm5, xm13
+    paddd               xm5, xm7
+    psrad               xm5, 10
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw           [dstq], xm5, 0
+    add                dstq, dsq
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w2_loop
+    movq                xm5, [srcq]
+    test                myd, 0x400
+    jz .w2_skip_line
+    add                srcq, ssq
+    shufps              xm3, xm0, q1032     ; 01 12
+    shufps              xm0, xm2, q1032     ; 23 34
+    shufps              xm2, xm4, q1032     ; 45 56
+    pshufb              xm5, xm14
+    pmaddubsw           xm5, xm15
+    phaddw              xm5, xm5
+    pmulhrsw            xm5, xm12
+    palignr             xm1, xm5, xm1, 12
+    punpcklqdq          xm1, xm1            ; 6 7 6 7
+    punpcklwd           xm4, xm1, xm5       ; 67 __
+    jmp .w2_loop
+.w2_skip_line:
+    movhps              xm5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova                xm3, xm0            ; 01 12
+    mova                xm0, xm2            ; 23 34
+    pshufb              xm5, xm14
+    pmaddubsw           xm5, xm15
+    phaddw              xm5, xm5
+    pmulhrsw            xm5, xm12           ; 6 7 6 7
+    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
+    pshufd              xm5, xm1, q0321     ; 5 6 7 _
+    punpcklwd           xm2, xm1, xm5       ; 45 56
+    punpckhwd           xm4, xm1, xm5       ; 67 __
+    jmp .w2_loop
+%endif
+.w4:
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m0, m14, m10
+    psrld                m0, 6
+    paddd              xm15, xm0
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    pextrd             r11d, xm15, 2
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2]
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
+    pcmpeqd              m0, m9
+    psrld               m14, 10
+    movu                xm7, [srcq+ssq*0]
+    movu                xm9, [srcq+ssq*1]
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
+    movu                xm8, [srcq+ssq*2]
+    movu               xm10, [srcq+ss3q ]
+    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m7, [srcq+ssq*0], 1
+    vinserti128          m9, [srcq+ssq*1], 1
+    vinserti128         m15, xm15, 1
+    vinserti128          m8, [srcq+ssq*2], 1
+    vinserti128         m10, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    pblendvb            m15, m11, m0
+    pshufb               m7, m14
+    pshufb               m9, m14
+    pshufb               m8, m14
+    pshufb              m10, m14
+    pmaddubsw            m7, m15
+    pmaddubsw            m9, m15
+    pmaddubsw            m8, m15
+    pmaddubsw           m10, m15
+    phaddw               m7, m9
+    phaddw               m8, m10
+    pmulhrsw             m7, m12                ; 0 1  4 5
+    pmulhrsw             m8, m12                ; 2 3  6 7
+    vextracti128        xm9, m7, 1              ; 4 5
+    vextracti128        xm3, m8, 1              ; 6 7
+    shufps              xm4, xm7, xm8, q1032    ; 1 2
+    shufps              xm5, xm8, xm9, q1032    ; 3 4
+    shufps              xm6, xm9, xm3, q1032    ; 5 6
+    psrldq             xm11, xm3, 8             ; 7 _
+    punpcklwd           xm0, xm7, xm4   ; 01
+    punpckhwd           xm7, xm4        ; 12
+    punpcklwd           xm1, xm8, xm5   ; 23
+    punpckhwd           xm8, xm5        ; 34
+    punpcklwd           xm2, xm9, xm6   ; 45
+    punpckhwd           xm9, xm6        ; 56
+    punpcklwd           xm3, xm11       ; 67
+    mova         [rsp+0x00], xm7
+    mova         [rsp+0x10], xm8
+    mova         [rsp+0x20], xm9
+.w4_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm10, r6q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8
+    pshufd              xm7, xm10, q0000
+    pshufd              xm8, xm10, q1111
+    pshufd              xm9, xm10, q2222
+    pshufd             xm10, xm10, q3333
+    pmaddwd             xm4, xm0, xm7
+    pmaddwd             xm5, xm1, xm8
+    pmaddwd             xm6, xm2, xm9
+    pmaddwd             xm7, xm3, xm10
+    paddd               xm4, xm5
+    paddd               xm6, xm7
+    paddd               xm4, xm13
+    paddd               xm4, xm6
+    psrad               xm4, rndshift
+    packssdw            xm4, xm4
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movd             [dstq], xm4
+    add                dstq, dsq
+%else
+    movq             [tmpq], xm4
+    add                tmpq, 8
+%endif
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w4_loop
+    movu                xm4, [srcq]
+    test                myd, 0x400
+    jz .w4_skip_line
+    mova                xm0, [rsp+0x00]
+    mova         [rsp+0x00], xm1
+    mova                xm1, [rsp+0x10]
+    mova         [rsp+0x10], xm2
+    mova                xm2, [rsp+0x20]
+    mova         [rsp+0x20], xm3
+    pshufb              xm4, xm14
+    pmaddubsw           xm4, xm15
+    phaddw              xm4, xm4
+    pmulhrsw            xm4, xm12
+    punpcklwd           xm3, xm11, xm4
+    mova               xm11, xm4
+    add                srcq, ssq
+    jmp .w4_loop
+.w4_skip_line:
+    movu                xm5, [srcq+ssq*1]
+    movu                 m6, [rsp+0x10]
+    pshufb              xm4, xm14
+    pshufb              xm5, xm14
+    pmaddubsw           xm4, xm15
+    pmaddubsw           xm5, xm15
+    movu         [rsp+0x00], m6
+    phaddw              xm4, xm5
+    pmulhrsw            xm4, xm12
+    punpcklwd           xm9, xm11, xm4
+    mova         [rsp+0x20], xm9
+    psrldq             xm11, xm4, 8
+    mova                xm0, xm1
+    mova                xm1, xm2
+    mova                xm2, xm3
+    punpcklwd           xm3, xm4, xm11
+    lea                srcq, [srcq+ssq*2]
+    jmp .w4_loop
+.w8:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+r4*8]
+    movq               xm10, [base+subpel_filters+r6*8]
+    movhps             xm15, [base+subpel_filters+r7*8]
+    movhps             xm10, [base+subpel_filters+r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    psrld               m14, 10
+    mova              [rsp], xm14
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    mov                 dyd, dym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    vbroadcasti128      m14, [base+wswap]
+.w8_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufd               m8, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m6, m2, m8
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w8_loop
+    test                myd, 0x400
+    mov            [rsp+16], myd
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    jz .w8_skip_line
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    add                srcq, ssq
+    mov                 myd, [rsp+16]
+    mov                 dyd, dym
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa
+    jmp .w8_loop
+.w8_skip_line:
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    vpbroadcastq         m7, [srcq+r13]
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    add                srcq, ssq
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
+    vinserti128          m5, [srcq+r10], 1
+    vinserti128          m6, [srcq+r11], 1
+    vpbroadcastq         m9, [srcq+r13]
+    vpbroadcastq        m11, [srcq+ rX]
+    add                srcq, ssq
+    mov                 myd, [rsp+16]
+    mov                 dyd, dym
+    vpblendd             m3, m7, 0xc0
+    vpblendd             m4, m8, 0xc0
+    vpblendd             m5, m9, 0xc0
+    vpblendd             m6, m11, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    pmaddubsw            m5, m15
+    pmaddubsw            m6, m10
+    phaddw               m3, m4
+    phaddw               m5, m6
+    psrld                m4, m3, 16
+    pslld                m6, m5, 16
+    paddw                m3, m4
+    paddw                m5, m6
+    pblendw              m3, m5, 0xaa
+    pmulhrsw             m3, m12
+    jmp .w8_loop
+.w16:
+    mov      dword [rsp+48], 2
+    movifprep   tmp_stridem, 32
+    jmp .w_start
+.w32:
+    mov      dword [rsp+48], 4
+    movifprep   tmp_stridem, 64
+    jmp .w_start
+.w64:
+    mov      dword [rsp+48], 8
+    movifprep   tmp_stridem, 128
+    jmp .w_start
+.w128:
+    mov      dword [rsp+48], 16
+    movifprep   tmp_stridem, 256
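+; Wider blocks are processed as 8-pixel-wide column strips: dword [rsp+48]
+; holds the strip count, and .hloop_prep advances dstq/tmpq to the next strip.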
+.w_start:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+72], t0d
+    mov            [rsp+56], srcq
+    mov            [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .hloop
+.hloop_prep:
+    dec      dword [rsp+48]
+    jz .ret
+    add      qword [rsp+64], 8*(isprep+1)
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp+16]
+    vpbroadcastd        m15, [rsp+72]
+    pxor                 m9, m9
+    mov                srcq, [rsp+56]
+    mov                 r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu           [rsp+16], m14
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    mova              [rsp], xm14
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    mov                 dyd, dym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    vbroadcasti128      m14, [base+wswap]
+.vloop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufd               m8, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m6, m2, m8
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .hloop_prep
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .vloop
+    test                myd, 0x400
+    mov            [rsp+52], myd
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    jz .skip_line
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    add                srcq, ssq
+    mov                 myd, [rsp+52]
+    mov                 dyd, dym
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa
+    jmp .vloop
+.skip_line:
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    vpbroadcastq         m7, [srcq+r13]
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    add                srcq, ssq
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
+    vinserti128          m5, [srcq+r10], 1
+    vinserti128          m6, [srcq+r11], 1
+    vpbroadcastq         m9, [srcq+r13]
+    vpbroadcastq        m11, [srcq+ rX]
+    add                srcq, ssq
+    mov                 myd, [rsp+52]
+    mov                 dyd, dym
+    vpblendd             m3, m7, 0xc0
+    vpblendd             m4, m8, 0xc0
+    vpblendd             m5, m9, 0xc0
+    vpblendd             m6, m11, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    pmaddubsw            m5, m15
+    pmaddubsw            m6, m10
+    phaddw               m3, m4
+    phaddw               m5, m6
+    psrld                m4, m3, 16
+    pslld                m6, m5, 16
+    paddw                m3, m4
+    paddw                m5, m6
+    pblendw              m3, m5, 0xaa
+    pmulhrsw             m3, m12
+    jmp .vloop
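+; dy == 1024: the vertical position advances by exactly one source row per
+; output row (dy has a 10-bit fraction), so the vertical filter phase stays
+; constant across rows.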
+.dy1:
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy1_w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0]
+    movq                xm1, [srcq+ssq*2]
+    movhps              xm0, [srcq+ssq*1]
+    movhps              xm1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m0, [srcq+ssq*0], 1
+    vinserti128          m1, [srcq+ssq*2], 1
+    vpbroadcastq         m2, [srcq+ssq*1]
+    add                srcq, ss3q
+    movq               xm10, r4q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8
+    vpblendd            m15, m7, 0xaa
+    pblendvb            m15, m11, m8
+    pshufd              xm8, xm10, q0000
+    pshufd              xm9, xm10, q1111
+    pshufd             xm11, xm10, q3333
+    pshufd             xm10, xm10, q2222
+    vpblendd             m0, m2, 0xc0
+    pshufb               m1, m14
+    pshufb               m0, m14
+    pmaddubsw            m1, m15
+    pmaddubsw            m0, m15
+    phaddw               m0, m1
+    pmulhrsw             m0, m12
+    vextracti128        xm1, m0, 1
+    palignr             xm2, xm1, xm0, 4
+    pshufd              xm4, xm1, q2121
+    punpcklwd           xm3, xm0, xm2       ; 01 12
+    punpckhwd           xm0, xm2            ; 23 34
+    punpcklwd           xm2, xm1, xm4       ; 45 56
+.dy1_w2_loop:
+    movq                xm1, [srcq+ssq*0]
+    movhps              xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd             xm5, xm3, xm8
+    pmaddwd             xm6, xm0, xm9
+    pmaddwd             xm7, xm2, xm10
+    mova                xm3, xm0
+    mova                xm0, xm2
+    paddd               xm5, xm13
+    paddd               xm6, xm7
+    pshufb              xm1, xm14
+    pmaddubsw           xm1, xm15
+    phaddw              xm1, xm1
+    pmulhrsw            xm1, xm12
+    palignr             xm7, xm1, xm4, 12
+    punpcklwd           xm2, xm7, xm1     ; 67 78
+    pmaddwd             xm7, xm2, xm11
+    mova                xm4, xm1
+    paddd               xm5, xm6
+    paddd               xm5, xm7
+    psrad               xm5, rndshift
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0
+    pextrw     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy1_w2_loop
+    RET
+%endif
+.dy1_w4:
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    vpermq               m8, m8, q3120
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd             r11d, xm15, 1
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2]
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    movu                xm2, [srcq+ssq*0]
+    movu                xm3, [srcq+ssq*2]
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
+    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20
+    vinserti128          m2, [srcq+ssq*1], 1
+    vinserti128          m3, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb              m14, m5
+    paddb               m14, m6
+    movu                xm4, [srcq+ssq*0]
+    movu                xm5, [srcq+ssq*2]
+    vinserti128          m4, [srcq+ssq*1], 1
+    add                srcq, ss3q
+    vpblendd            m15, m7, 0x30
+    punpcklqdq          m15, m15
+    pblendvb            m15, m11, m8
+    movq               xm10, r4q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8
+    vinserti128         m10, xm10, 1
+    pshufb               m2, m14
+    pshufb               m3, m14
+    pshufb               m4, m14
+    pshufb              xm5, xm14
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    pshufd               m7, m10, q0000
+    pshufd               m8, m10, q1111
+    pshufd               m9, m10, q2222
+    pshufd              m10, m10, q3333
+    pmaddubsw            m2, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m15
+    phaddw               m2, m3
+    phaddw               m4, m5
+    pmulhrsw             m2, m12
+    pmulhrsw             m4, m12
+    palignr              m5, m4, m2, 4
+    pshufd               m3, m4, q2121
+    punpcklwd            m0, m2, m5     ; 01 12
+    punpckhwd            m1, m2, m5     ; 23 34
+    punpcklwd            m2, m4, m3     ; 45 56
+.dy1_w4_loop:
+    movu               xm11, [srcq+ssq*0]
+    vinserti128         m11, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m4, m0, m7
+    pmaddwd              m5, m1, m8
+    pmaddwd              m6, m2, m9
+    mova                 m0, m1
+    mova                 m1, m2
+    paddd                m4, m13
+    paddd                m5, m6
+    pshufb              m11, m14
+    vpermq              m11, m11, q3120
+    pmaddubsw           m11, m15
+    phaddw              m11, m11
+    pmulhrsw            m11, m12
+    palignr              m6, m11, m3, 12
+    punpcklwd            m2, m6, m11    ; 67 78
+    mova                 m3, m11
+    pmaddwd              m6, m2, m10
+    paddd                m4, m5
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    pshuflw             xm4, xm4, q3120
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+%else
+    pshufd              xm4, xm4, q3120
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    sub                  hd, 2
+    jg .dy1_w4_loop
+    MC_8TAP_SCALED_RET
+.dy1_w8:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    mov            [rsp+32], r7d
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    movu              [rsp], m10
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    lea                 myd, [t1+myq]
+    mov                 t1d, 64 << 24
+    cmovnz              t1q, [base+subpel_filters+myq*8]
+    vbroadcasti128      m14, [base+wswap]
+    movq               xm11, t1q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    mov                 r7d, [rsp+32]
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+.dy1_w8_loop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m10
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, [rsp]
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa
+    jmp .dy1_w8_loop
+.dy1_w16:
+    mov      dword [rsp+72], 2
+    movifprep   tmp_stridem, 32
+    jmp .dy1_w_start
+.dy1_w32:
+    mov      dword [rsp+72], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy1_w_start
+.dy1_w64:
+    mov      dword [rsp+72], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy1_w_start
+.dy1_w128:
+    mov      dword [rsp+72], 16
+    movifprep   tmp_stridem, 256
+.dy1_w_start:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+76], t0d
+    mov            [rsp+80], srcq
+    mov            [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .dy1_hloop
+.dy1_hloop_prep:
+    dec      dword [rsp+72]
+    jz .ret
+    add      qword [rsp+88], 8*(isprep+1)
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp+32]
+    vpbroadcastd        m15, [rsp+76]
+    pxor                 m9, m9
+    mov                srcq, [rsp+80]
+    mov                 r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu           [rsp+32], m14
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movq           [rsp+64], xm14
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    movu              [rsp], m10
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vbroadcasti128      m14, [base+wswap]
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    mov                 r4d, [rsp+64]
+    mov                 r7d, [rsp+68]
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+.dy1_vloop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m10
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy1_hloop_prep
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, [rsp]
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa
+    jmp .dy1_vloop
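+; dy == 2048: two source rows per output row (2x vertical downscale); the
+; filter phase is likewise constant and each output row consumes two new
+; input rows.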
+.dy2:
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy2_w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0]
+    vpbroadcastq         m2, [srcq+ssq*1]
+    movhps              xm0, [srcq+ssq*2]
+    vpbroadcastq         m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vpblendd            m15, m7, 0xaa
+    pblendvb            m15, m11, m8
+    movhps              xm1, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vpblendd             m0, m2, 0x30
+    vpblendd             m1, m4, 0xc0
+    vpblendd             m0, m3, 0xc0
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    phaddw               m0, m1
+    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
+    pshufd              xm8, xm11, q0000
+    pshufd              xm9, xm11, q1111
+    pshufd             xm10, xm11, q2222
+    pshufd             xm11, xm11, q3333
+    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
+    vextracti128        xm1, m2, 1
+    punpcklwd           xm3, xm2, xm1       ; 01 23
+    punpckhwd           xm2, xm1            ; 23 45
+.dy2_w2_loop:
+    movq                xm6, [srcq+ssq*0]
+    vpbroadcastq         m7, [srcq+ssq*1]
+    movhps              xm6, [srcq+ssq*2]
+    vpbroadcastq         m1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pmaddwd             xm4, xm3, xm8
+    pmaddwd             xm5, xm2, xm9
+    vpblendd             m6, m7, 0x30
+    vpblendd             m6, m1, 0xc0
+    pshufb               m6, m14
+    pmaddubsw            m6, m15
+    phaddw               m6, m6
+    pmulhrsw             m6, m12
+    palignr              m0, m6, m0, 8
+    pshufd               m2, m0, q3221
+    vextracti128        xm1, m2, 1
+    punpcklwd           xm3, xm2, xm1       ; 45 67
+    punpckhwd           xm2, xm1            ; 67 89
+    pmaddwd             xm6, xm3, xm10
+    pmaddwd             xm7, xm2, xm11
+    paddd               xm4, xm5
+    paddd               xm4, xm13
+    paddd               xm6, xm7
+    paddd               xm4, xm6
+    psrad               xm4, rndshift
+    packssdw            xm4, xm4
+    packuswb            xm4, xm4
+    pextrw     [dstq+dsq*0], xm4, 0
+    pextrw     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy2_w2_loop
+    RET
+%endif
+.dy2_w4:
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    pextrd             r11d, xm15, 2
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2]
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movu                xm0, [srcq+ssq*0]
+    movu                xm2, [srcq+ssq*2]
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
+    movu                xm1, [srcq+ssq*1]
+    movu                xm3, [srcq+ss3q ]
+    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vinserti128         m15, xm15, 1
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m2, [srcq+ssq*0], 1
+    vinserti128          m3, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pblendvb            m15, m11, m8
+    pshufb              xm0, xm14
+    pshufb               m2, m14
+    pshufb              xm1, xm14
+    pshufb               m3, m14
+    pmaddubsw           xm0, xm15
+    pmaddubsw            m2, m15
+    pmaddubsw           xm1, xm15
+    pmaddubsw            m3, m15
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    phaddw               m0, m2
+    phaddw               m1, m3
+    pmulhrsw             m0, m12    ; 0 2  _ 4
+    pmulhrsw             m1, m12    ; 1 3  _ 5
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    punpcklwd           xm2, xm0, xm1
+    punpckhwd            m1, m0, m1     ; 23 45
+    vinserti128          m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+    movu                xm6, [srcq+ssq*0]
+    movu                xm7, [srcq+ssq*1]
+    vinserti128          m6, [srcq+ssq*2], 1
+    vinserti128          m7, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufb               m6, m14
+    pshufb               m7, m14
+    pmaddubsw            m6, m15
+    pmaddubsw            m7, m15
+    psrld                m2, m6, 16
+    pslld                m3, m7, 16
+    paddw                m6, m2
+    paddw                m7, m3
+    pblendw              m6, m7, 0xaa   ; 67 89
+    pmulhrsw             m6, m12
+    paddd                m4, m5
+    vpblendd             m0, m1, m6, 0x0f
+    mova                 m1, m6
+    vpermq               m0, m0, q1032  ; 45 67
+    pmaddwd              m6, m0, m10
+    pmaddwd              m7, m1, m11
+    paddd                m4, m13
+    paddd                m6, m7
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    sub                  hd, 2
+    jg .dy2_w4_loop
+    MC_8TAP_SCALED_RET
+.dy2_w8:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    mov               [rsp], r7d
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    lea                 myd, [t1+myq]
+    mov                 t1d, 64 << 24
+    cmovnz              t1q, [base+subpel_filters+myq*8]
+    movq               xm11, t1q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    mov                 r7d, [rsp]
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m14, m11, q2222
+    pshufd              m11, m11, q3333
+.dy2_w8_loop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m14
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    vpbroadcastq         m5, [srcq+r13]
+    vpbroadcastq         m6, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m3, m5, 0xc0
+    vpblendd             m4, m6, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    phaddw               m3, m4
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
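+    ; finish the two rows' horizontal 8-tap sums (still split across word
+    ; pairs after phaddw) and interleave them: row 0 lands in the even
+    ; words, row 1 in the odd words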
+    psrld                m5, m3, 16
+    pslld                m6, m4, 16
+    paddw                m3, m5
+    paddw                m4, m6
+    pblendw              m3, m4, 0xaa
+    pmulhrsw             m3, m12
+    jmp .dy2_w8_loop
+.dy2_w16:
+    mov      dword [rsp+40], 2
+    movifprep   tmp_stridem, 32
+    jmp .dy2_w_start
+.dy2_w32:
+    mov      dword [rsp+40], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy2_w_start
+.dy2_w64:
+    mov      dword [rsp+40], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy2_w_start
+.dy2_w128:
+    mov      dword [rsp+40], 16
+    movifprep   tmp_stridem, 256
+.dy2_w_start:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+64], t0d
+    mov            [rsp+48], srcq
+    mov            [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .dy2_hloop
+.dy2_hloop_prep:
+    dec      dword [rsp+40]
+    jz .ret
+    add      qword [rsp+56], 8*(isprep+1)
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp]
+    vpbroadcastd        m15, [rsp+64]
+    pxor                 m9, m9
+    mov                srcq, [rsp+48]
+    mov                 r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu              [rsp], m14
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movq           [rsp+32], xm14
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    movq               xm14, r4q
+    punpcklbw          xm14, xm14
+    psraw              xm14, 8
+    vinserti128         m14, xm14, 1
+    mov                 r4d, [rsp+32]
+    mov                 r7d, [rsp+36]
+    pshufd               m8, m14, q0000
+    pshufd               m9, m14, q1111
+    pshufd              m11, m14, q2222
+    pshufd              m14, m14, q3333
+.dy2_vloop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m11
+    pmaddwd              m7, m3, m14
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy2_hloop_prep
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    vpbroadcastq         m5, [srcq+r13]
+    vpbroadcastq         m6, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m3, m5, 0xc0
+    vpblendd             m4, m6, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    phaddw               m3, m4
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
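+    ; combine and interleave the two rows as in .dy2_w8_loop above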
+    psrld                m5, m3, 16
+    pslld                m6, m4, 16
+    paddw                m3, m5
+    paddw                m4, m6
+    pblendw              m3, m4, 0xaa
+    pmulhrsw             m3, m12
+    jmp .dy2_vloop
+.ret:
+    MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+    mov                 t0d, (5*15 << 16) | 5*15
+    mov                 t1d, (5*15 << 16) | 5*15
+    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+    ; Can be done using gathers, but that's terribly slow on many CPUs
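+    ; Per output row: load four pairs of 8-tap filters indexed by my >> 10,
+    ; with my stepping by delta across the eight columns and advancing by
+    ; gamma once per row (matching the register naming used here)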
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                xm8, [filterq+myq  *8]
+    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+deltaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                xm9, [filterq+myq  *8]
+    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+gammaq]       ; my += gamma
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    punpcklwd            m8, m0
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
+    punpcklwd            m0, m9, m0
+    punpckldq            m9, m8, m0
+    punpckhdq            m0, m8, m0
+    punpcklbw            m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+    punpckhbw            m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+    pmaddwd             m%2, m8
+    pmaddwd              m9, m%3
+    punpcklbw            m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+    punpckhbw            m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+    pmaddwd              m8, m%4
+    pmaddwd              m0, m%5
+    paddd               m%2, m9
+    paddd                m0, m8
+    paddd               m%1, m0, m%2
+%endmacro
+
+cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+%if WIN64
+    sub                 rsp, 0xa0
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
+.loop:
+    psrad                m7, 13
+    psrad                m0, 13
+    packssdw             m7, m0
+    pmulhrsw             m7, m14 ; (x + (1 << 6)) >> 7
+    vpermq               m7, m7, q3120
+    mova         [tmpq+tsq*0], xm7
+    vextracti128 [tmpq+tsq*2], m7, 1
+    dec                 r4d
+    jz   mangle(private_prefix %+ _warp_affine_8x8_avx2).end
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
+    lea                tmpq, [tmpq+tsq*4]
+    jmp .loop
+
+cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+                                   beta, filter, tmp1, delta, my, gamma
+%if WIN64
+    sub                 rsp, 0xa0
+    %assign xmm_regs_used 16
+    %assign stack_size_padded 0xa0
+    %assign stack_offset stack_offset+stack_size_padded
+%endif
+    call .main
+    jmp .start
+.loop:
+    call .main2
+    lea                dstq, [dstq+dsq*2]
+.start:
+    psrad                m7, 18
+    psrad                m0, 18
+    packusdw             m7, m0
+    pavgw                m7, m11 ; (x + (1 << 10)) >> 11
+    vextracti128        xm0, m7, 1
+    packuswb            xm7, xm0
+    pshufd              xm7, xm7, q3120
+    movq       [dstq+dsq*0], xm7
+    movhps     [dstq+dsq*1], xm7
+    dec                 r4d
+    jg .loop
+.end:
+    RET
+ALIGN function_align
+.main:
+    ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+    mov               abcdq, r5m
+    mov                 mxd, r6m
+    movaps [rsp+stack_offset+0x10], xmm6
+    movaps [rsp+stack_offset+0x20], xmm7
+    movaps       [rsp+0x28], xmm8
+    movaps       [rsp+0x38], xmm9
+    movaps       [rsp+0x48], xmm10
+    movaps       [rsp+0x58], xmm11
+    movaps       [rsp+0x68], xmm12
+    movaps       [rsp+0x78], xmm13
+    movaps       [rsp+0x88], xmm14
+    movaps       [rsp+0x98], xmm15
+%endif
+    movsx            alphad, word [abcdq+2*0]
+    movsx             betad, word [abcdq+2*1]
+    mova                m12, [warp_8x8_shufA]
+    mova                m13, [warp_8x8_shufB]
+    vpbroadcastd        m14, [pw_8192]
+    vpbroadcastd        m15, [pd_32768]
+    pxor                m11, m11
+    lea             filterq, [mc_warp_filter]
+    lea               tmp1q, [ssq*3+3]
+    add                 mxd, 512+(64<<10)
+    lea               tmp2d, [alphaq*3]
+    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
+    sub               betad, tmp2d    ; beta -= alpha*3
+    mov                 myd, r7m
+    call .h
+    psrld                m1, m0, 16
+    call .h
+    psrld                m4, m0, 16
+    call .h
+    pblendw              m1, m0, 0xaa ; 02
+    call .h
+    pblendw              m4, m0, 0xaa ; 13
+    call .h
+    psrld                m2, m1, 16
+    pblendw              m2, m0, 0xaa ; 24
+    call .h
+    psrld                m5, m4, 16
+    pblendw              m5, m0, 0xaa ; 35
+    call .h
+    psrld                m3, m2, 16
+    pblendw              m3, m0, 0xaa ; 46
+    movsx            deltad, word [abcdq+2*2]
+    movsx            gammad, word [abcdq+2*3]
+    add                 myd, 512+(64<<10)
+    mov                 r4d, 4
+    lea               tmp1d, [deltaq*3]
+    sub              gammad, tmp1d    ; gamma -= delta*3
+.main2:
+    call .h
+    psrld                m6, m5, 16
+    pblendw              m6, m0, 0xaa ; 57
+    WARP_V                7, 1, 3, 4, 6
+    call .h
+    mova                 m1, m2
+    mova                 m2, m3
+    psrld                m3, 16
+    pblendw              m3, m0, 0xaa ; 68
+    WARP_V                0, 4, 6, 1, 3
+    mova                 m4, m5
+    mova                 m5, m6
+    ret
+ALIGN function_align
+.h:
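+    ; horizontal counterpart of WARP_V: the filter for each column is
+    ; selected from mx >> 10, with mx stepping by alpha per column and
+    ; by beta per row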
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+    vbroadcasti128      m10, [srcq]
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    movq                xm8, [filterq+mxq  *8]
+    vinserti128          m8, [filterq+tmp1q*8], 1
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+alphaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    movq                xm9, [filterq+mxq  *8]
+    vinserti128          m9, [filterq+tmp1q*8], 1
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+betaq] ; mx += beta
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    punpcklqdq           m8, m0  ; 0 1   4 5
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1
+    punpcklqdq           m9, m0  ; 2 3   6 7
+    pshufb               m0, m10, m12
+    pmaddubsw            m0, m8
+    pshufb              m10, m13
+    pmaddubsw           m10, m9
+    add                srcq, ssq
+    phaddw               m0, m10
+    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
+    paddd                m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+    ret
+
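+; Runs a single macro invocation with 256-bit (YMM) registers inside an
+; otherwise 512-bit (ZMM) build, then restores the ZMM register width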
+%macro WRAP_YMM 1+
+    INIT_YMM cpuname
+    %1
+    INIT_ZMM cpuname
+%endmacro
+
+%macro BIDIR_FN 1 ; op
+%if mmsize == 64
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    cmp                  hd, 8
+    jg .w4_h16
+    WRAP_YMM %1           0
+    vextracti32x4      xmm1, ym0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xmm1
+    pextrd [dstq+stride3q ], xmm1, 1
+    jl .w4_ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xmm1, 2
+    pextrd [dstq+stride3q ], xmm1, 3
+.w4_ret:
+    RET
+.w4_h16:
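+    ; build one dword offset per row (strideq scaled by the bidir_sctr_w4
+    ; pattern) so all rows can be stored with a single scatter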
+    vpbroadcastd         m7, strided
+    pmulld               m7, [bidir_sctr_w4]
+    %1                    0
+    kxnorw               k1, k1, k1
+    vpscatterdd [dstq+m7]{k1}, m0
+    RET
+.w8:
+    cmp                  hd, 4
+    jne .w8_h8
+    WRAP_YMM %1           0
+    vextracti128       xmm1, ym0, 1
+    movq   [dstq          ], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    RET
+.w8_loop:
+    %1_INC_PTR            2
+    lea                dstq, [dstq+strideq*4]
+.w8_h8:
+    %1                    0
+    vextracti32x4      xmm1, ym0, 1
+    vextracti32x4      xmm2, m0, 2
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq          ], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq          ], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    sub                  hd, 8
+    jg .w8_loop
+    RET
+.w16_loop:
+    %1_INC_PTR            2
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    %1                    0
+    vpermq               m0, m0, q3120
+    mova          [dstq          ], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxbq             m7, [warp_8x8_shufA]
+.w32_loop:
+    %1                    0
+    %1_INC_PTR            2
+    vpermq               m0, m7, m0
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxbq             m7, [warp_8x8_shufA]
+.w64_loop:
+    %1                    0
+    %1_INC_PTR            2
+    vpermq               m0, m7, m0
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128:
+    pmovzxbq             m7, [warp_8x8_shufA]
+.w128_loop:
+    %1                    0
+    vpermq               m6, m7, m0
+    %1                    2
+    mova        [dstq+64*0], m6
+    %1_INC_PTR            4
+    vpermq               m6, m7, m0
+    mova        [dstq+64*1], m6
+    add                dstq, strideq
+    dec                  hd
+    jg .w128_loop
+    RET
+%else
+    %1                    0
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    cmp                  hd, 8
+    je .ret
+    %1                    2
+    lea                dstq, [dstq+strideq*4]
+    vextracti128        xm1, m0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+.ret:
+    RET
+.w8_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w8:
+    vextracti128        xm1, m0, 1
+    movq   [dstq          ], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq          ], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq*2]
+.w32:
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*0], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*1], m0
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop:
+    %1_INC_PTR            4
+    %1                    0
+    add                dstq, strideq
+.w64:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova          [dstq+32], m0
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    %1                    0
+    add                dstq, strideq
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+0*32], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova        [dstq+1*32], m0
+    %1_INC_PTR            8
+    %1                   -4
+    vpermq               m0, m0, q3120
+    mova        [dstq+2*32], m0
+    %1                   -2
+    vpermq               m0, m0, q3120
+    mova        [dstq+3*32], m0
+    dec                  hd
+    jg .w128_loop
+    RET
+%endif
+%endmacro
+
+%macro AVG 1 ; src_offset
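+    ; per pixel (8 bpc, roughly): dst = (tmp1 + tmp2 + 16) >> 5; pmulhrsw
+    ; against pw_1024 performs the (x + 16) >> 5 rounding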
+    mova                 m0, [tmp1q+(%1+0)*mmsize]
+    paddw                m0, [tmp2q+(%1+0)*mmsize]
+    mova                 m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m1, [tmp2q+(%1+1)*mmsize]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    packuswb             m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+%macro AVG_FN 0
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+    lea                  r6, [avg %+ SUFFIX %+ _table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r6+wq*4]
+    vpbroadcastd         m2, [base+pw_1024]
+    add                  wq, r6
+    BIDIR_FN            AVG
+%endmacro
+
+%macro W_AVG 1 ; src_offset
+    ; (a * weight + b * (16 - weight) + 128) >> 8
+    ; = ((a - b) * weight + (b << 4) + 128) >> 8
+    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
+    mova                 m0,     [tmp1q+(%1+0)*mmsize]
+    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
+    mova                 m1,     [tmp1q+(%1+1)*mmsize]
+    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
+    pmulhw               m2, m4
+    pmulhw               m3, m4
+    paddw                m0, m2
+    paddw                m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+%macro W_AVG_FN 0
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+    lea                  r6, [w_avg %+ SUFFIX %+ _table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    vpbroadcastw         m4, r6m ; weight
+    movsxd               wq, dword [r6+wq*4]
+    vpbroadcastd         m5, [base+pw_2048]
+    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
+    add                  wq, r6
+    cmp           dword r6m, 7
+    jg .weight_gt7
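+    ; for weight <= 7, weight << 12 doesn't wrap around to (weight-16) << 12
+    ; as a signed word, so swap the operands and negate the weight to use
+    ; the equivalent (b - a) * -weight form from the derivation above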
+    mov                  r6, tmp1q
+    pxor                 m0, m0
+    mov               tmp1q, tmp2q
+    psubw                m4, m0, m4 ; -weight
+    mov               tmp2q, r6
+.weight_gt7:
+    BIDIR_FN          W_AVG
+%endmacro
+
+%macro MASK 1 ; src_offset
+    ; (a * m + b * (64 - m) + 512) >> 10
+    ; = ((a - b) * m + (b << 6) + 512) >> 10
+    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
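+    ; the (-m << 10) factor is evaluated below as ((b - a) << 1) * (-m << 9)
+    ; via pmulhw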
+%if mmsize == 64
+    vpermq               m3, m8, [maskq+%1*32]
+%else
+    vpermq               m3,     [maskq+%1*16], q3120
+%endif
+    mova                 m0,     [tmp2q+(%1+0)*mmsize]
+    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
+    psubb                m3, m4, m3
+    paddw                m1, m1     ; (b - a) << 1
+    paddb                m3, m3
+    punpcklbw            m2, m4, m3 ; -m << 9
+    pmulhw               m1, m2
+    paddw                m0, m1
+    mova                 m1,     [tmp2q+(%1+1)*mmsize]
+    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m2, m2
+    punpckhbw            m3, m4, m3
+    pmulhw               m2, m3
+    paddw                m1, m2
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+    add               maskq, %1*mmsize/2
+    add               tmp2q, %1*mmsize
+    add               tmp1q, %1*mmsize
+%endmacro
+
+%macro MASK_FN 0
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+    lea                  r7, [mask %+ SUFFIX %+ _table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movsxd               wq, dword [r7+wq*4]
+    pxor                 m4, m4
+%if mmsize == 64
+    mova                 m8, [base+bilin_v_perm64]
+%endif
+    vpbroadcastd         m5, [base+pw_2048]
+    add                  wq, r7
+    BIDIR_FN           MASK
+%endmacro
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
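+    ; rough per-pixel sketch: 64 - m = saturating (6903 - |tmp1 - tmp2|) >> 8,
+    ; so m ramps from 38 (equal sources) up to 64 as the difference grows;
+    ; the output is then approximately (tmp1*m + tmp2*(64 - m) + 512) >> 10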
+    mova                m%1, [tmp1q+mmsize*%3]
+    mova                 m1, [tmp2q+mmsize*%3]
+    psubw                m1, m%1
+    pabsw               m%2, m1
+    psubusw             m%2, m6, m%2
+    psrlw               m%2, 8 ; 64 - m
+    psllw                m2, m%2, 10
+    pmulhw               m1, m2
+    paddw               m%1, m1
+    mova                 m1, [tmp1q+mmsize*%4]
+    mova                 m2, [tmp2q+mmsize*%4]
+    psubw                m2, m1
+    pabsw                m3, m2
+    psubusw              m3, m6, m3
+%if cpuflag(avx512icl)
+    vpshldw             m%2, m3, 8
+    psllw                m3, m%2, 10
+%if %5
+    psubb               m%2, m5, m%2
+%endif
+%else
+    psrlw                m3, 8
+%if %5
+    packuswb            m%2, m3
+    psubb               m%2, m5, m%2
+    vpermq              m%2, m%2, q3120
+%else
+    phaddw              m%2, m3
+%endif
+    psllw                m3, 10
+%endif
+    pmulhw               m2, m3
+    paddw                m1, m2
+    pmulhrsw            m%1, m7
+    pmulhrsw             m1, m7
+    packuswb            m%1, m1
+%endmacro
+
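+; blend: per pixel, dst = (dst * (64 - m) + tmp * m + 32) >> 6, computed with
+; pmaddubsw on interleaved (dst, tmp) bytes against (64 - m, m) coefficient
+; pairs, rounded via pmulhrsw with pw_512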
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+    lea                  r6, [blend_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movifnidn         maskq, maskmp
+    movsxd               wq, dword [r6+wq*4]
+    vpbroadcastd         m4, [base+pb_64]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r6
+    lea                  r6, [dsq*3]
+    jmp                  wq
+.w4:
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    vpbroadcastd        xm1, [dstq+dsq*2]
+    pinsrd              xm1, [dstq+r6   ], 3
+    mova                xm6, [maskq]
+    psubb               xm3, xm4, xm6
+    punpcklbw           xm2, xm3, xm6
+    punpckhbw           xm3, xm6
+    mova                xm6, [tmpq]
+    add               maskq, 4*4
+    add                tmpq, 4*4
+    punpcklbw           xm0, xm6
+    punpckhbw           xm1, xm6
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm3
+    pmulhrsw            xm0, xm5
+    pmulhrsw            xm1, xm5
+    packuswb            xm0, xm1
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    pextrd     [dstq+dsq*2], xm0, 2
+    pextrd     [dstq+r6   ], xm0, 3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+.w8:
+    movq                xm1, [dstq+dsq*0]
+    movhps              xm1, [dstq+dsq*1]
+    vpbroadcastq         m2, [dstq+dsq*2]
+    vpbroadcastq         m3, [dstq+r6   ]
+    mova                 m0, [maskq]
+    mova                 m6, [tmpq]
+    add               maskq, 8*4
+    add                tmpq, 8*4
+    vpblendd             m1, m2, 0x30
+    vpblendd             m1, m3, 0xc0
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    movq       [dstq+dsq*2], xm1
+    movhps     [dstq+r6   ], xm1
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
+    mova                 m0, [maskq]
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    mova                 m6, [tmpq]
+    add               maskq, 16*2
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w16
+    RET
+ALIGN function_align
+.w32:
+    mova                 m0, [maskq]
+    mova                 m1, [dstq]
+    mova                 m6, [tmpq]
+    add               maskq, 32
+    add                tmpq, 32
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .w32
+    RET
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+    lea                  r5, [blend_v_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r5
+    add               maskq, obmc_masks-blend_v_avx2_table
+    jmp                  wq
+.w2:
+    vpbroadcastd        xm2, [maskq+2*2]
+.w2_s0_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrw              xm0, [dstq+dsq*1], 1
+    movd                xm1, [tmpq]
+    add                tmpq, 2*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w2_s0_loop
+    RET
+ALIGN function_align
+.w4:
+    vpbroadcastq        xm2, [maskq+4*2]
+.w4_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    movq                xm1, [tmpq]
+    add                tmpq, 4*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    vbroadcasti128       m4, [maskq+8*2]
+.w8_loop:
+    vpbroadcastq         m2, [dstq+dsq*0]
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m2, 0x30
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add                tmpq, 8*2
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m4
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    vbroadcasti128       m3, [maskq+16*2]
+    vbroadcasti128       m4, [maskq+16*3]
+.w16_loop:
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    mova                 m2, [tmpq]
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m4
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    mova                xm3, [maskq+16*4]
+    vinserti128          m3, [maskq+16*6], 1
+    mova                xm4, [maskq+16*5]
+    vinserti128          m4, [maskq+16*7], 1
+.w32_loop:
+    mova                 m1, [dstq]
+    mova                 m2, [tmpq]
+    add                tmpq, 32
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m4
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_loop
+    RET
+
+cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+    lea                  r5, [blend_h_avx2_table]
+    mov                 r6d, wd
+    tzcnt                wd, wd
+    mov                  hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r5
+    lea               maskq, [base+obmc_masks+hq*2]
+    lea                  hd, [hq*3]
+    shr                  hd, 2 ; h * 3/4
+    lea               maskq, [maskq+hq*2]
+    neg                  hq
+    jmp                  wq
+.w2:
+    movd                xm0, [dstq+dsq*0]
+    pinsrw              xm0, [dstq+dsq*1], 1
+    movd                xm2, [maskq+hq*2]
+    movd                xm1, [tmpq]
+    add                tmpq, 2*2
+    punpcklwd           xm2, xm2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w2
+    RET
+ALIGN function_align
+.w4:
+    mova                xm3, [blend_shuf]
+.w4_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    movd                xm2, [maskq+hq*2]
+    movq                xm1, [tmpq]
+    add                tmpq, 4*2
+    pshufb              xm2, xm3
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    vbroadcasti128       m4, [blend_shuf]
+    shufpd               m4, m4, 0x03
+.w8_loop:
+    vpbroadcastq         m1, [dstq+dsq*0]
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m1, 0x30
+    vpbroadcastd         m3, [maskq+hq*2]
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add                tmpq, 8*2
+    pshufb               m3, m4
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m3
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    vbroadcasti128       m4, [blend_shuf]
+    shufpd               m4, m4, 0x0c
+.w16_loop:
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    vpbroadcastd         m3, [maskq+hq*2]
+    mova                 m2, [tmpq]
+    add                tmpq, 16*2
+    pshufb               m3, m4
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w16_loop
+    RET
+ALIGN function_align
+.w32: ; w32/w64/w128
+    sub                 dsq, r6
+.w32_loop0:
+    vpbroadcastw         m3, [maskq+hq*2]
+    mov                  wd, r6d
+.w32_loop:
+    mova                 m1, [dstq]
+    mova                 m2, [tmpq]
+    add                tmpq, 32
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, 32
+    sub                  wd, 32
+    jg .w32_loop
+    add                dstq, dsq
+    inc                  hq
+    jl .w32_loop0
+    RET
+
+cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+                             bottomext, rightext
+    ; we assume that the buffer (stride) is larger than width, so we can
+    ; safely overwrite by a few bytes
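+    ; rough outline of the steps below: clamp the source rectangle to the
+    ; image, copy it into the center of the bw x bh block, then fill the
+    ; left/right borders of each row and finally whole top/bottom rows by
+    ; replicating the nearest copied pixel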
+
+    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+    xor                r12d, r12d
+    lea                 r10, [ihq-1]
+    cmp                  yq, ihq
+    cmovs               r10, yq
+    test                 yq, yq
+    cmovs               r10, r12
+    imul                r10, sstrideq
+    add                srcq, r10
+
+    ; ref += iclip(x, 0, iw - 1)
+    lea                 r10, [iwq-1]
+    cmp                  xq, iwq
+    cmovs               r10, xq
+    test                 xq, xq
+    cmovs               r10, r12
+    add                srcq, r10
+
+    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+    lea          bottomextq, [yq+bhq]
+    sub          bottomextq, ihq
+    lea                  r3, [bhq-1]
+    cmovs        bottomextq, r12
+
+    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+                bottomext, rightext
+
+    ; top_ext = iclip(-y, 0, bh - 1)
+    neg             topextq
+    cmovs           topextq, r12
+    cmp          bottomextq, bhq
+    cmovns       bottomextq, r3
+    cmp             topextq, bhq
+    cmovg           topextq, r3
+
+    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+    lea           rightextq, [xq+bwq]
+    sub           rightextq, iwq
+    lea                  r2, [bwq-1]
+    cmovs         rightextq, r12
+
+    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+                bottomext, rightext
+
+    ; left_ext = iclip(-x, 0, bw - 1)
+    neg            leftextq
+    cmovs          leftextq, r12
+    cmp           rightextq, bwq
+    cmovns        rightextq, r2
+    cmp            leftextq, bwq
+    cmovns         leftextq, r2
+
+    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+                dst, dstride, src, sstride, bottomext, rightext
+
+    ; center_h = bh - top_ext - bottom_ext
+    lea                  r3, [bottomextq+topextq]
+    sub            centerhq, r3
+
+    ; blk += top_ext * PXSTRIDE(dst_stride)
+    mov                  r2, topextq
+    imul                 r2, dstrideq
+    add                dstq, r2
+    mov                 r9m, dstq
+
+    ; center_w = bw - left_ext - right_ext
+    mov            centerwq, bwq
+    lea                  r3, [rightextq+leftextq]
+    sub            centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+    ; left extension
+    xor                  r3, r3
+    vpbroadcastb         m0, [srcq]
+.left_loop_%3:
+    mova          [dstq+r3], m0
+    add                  r3, 32
+    cmp                  r3, leftextq
+    jl .left_loop_%3
+
+    ; body
+    lea                 r12, [dstq+leftextq]
+%endif
+    xor                  r3, r3
+.body_loop_%3:
+    movu                 m0, [srcq+r3]
+%if %1
+    movu           [r12+r3], m0
+%else
+    movu          [dstq+r3], m0
+%endif
+    add                  r3, 32
+    cmp                  r3, centerwq
+    jl .body_loop_%3
+
+%if %2
+    ; right extension
+%if %1
+    add                 r12, centerwq
+%else
+    lea                 r12, [dstq+centerwq]
+%endif
+    xor                  r3, r3
+    vpbroadcastb         m0, [srcq+centerwq-1]
+.right_loop_%3:
+    movu           [r12+r3], m0
+    add                  r3, 32
+    cmp                  r3, rightextq
+    jl .right_loop_%3
+
+%endif
+    add                dstq, dstrideq
+    add                srcq, sstrideq
+    dec            centerhq
+    jg .v_loop_%3
+%endmacro
+
+    test           leftextq, leftextq
+    jnz .need_left_ext
+    test          rightextq, rightextq
+    jnz .need_right_ext
+    v_loop                0, 0, 0
+    jmp .body_done
+
+.need_left_ext:
+    test          rightextq, rightextq
+    jnz .need_left_right_ext
+    v_loop                1, 0, 1
+    jmp .body_done
+
+.need_left_right_ext:
+    v_loop                1, 1, 2
+    jmp .body_done
+
+.need_right_ext:
+    v_loop                0, 1, 3
+
+.body_done:
+    ; bottom edge extension
+    test         bottomextq, bottomextq
+    jz .top
+    mov                srcq, dstq
+    sub                srcq, dstrideq
+    xor                  r1, r1
+.bottom_x_loop:
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, bottomextq
+.bottom_y_loop:
+    mova               [r3], m0
+    add                  r3, dstrideq
+    dec                  r4
+    jg .bottom_y_loop
+    add                  r1, 32
+    cmp                  r1, bwq
+    jl .bottom_x_loop
+
+.top:
+    ; top edge extension
+    test            topextq, topextq
+    jz .end
+    mov                srcq, r9m
+    mov                dstq, dstm
+    xor                  r1, r1
+.top_x_loop:
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, topextq
+.top_y_loop:
+    mova               [r3], m0
+    add                  r3, dstrideq
+    dec                  r4
+    jg .top_y_loop
+    add                  r1, 32
+    cmp                  r1, bwq
+    jl .top_x_loop
+
+.end:
+    RET
+
+cextern resize_filter
+
+INIT_YMM avx2
+cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
+                           dst_w, h, src_w, dx, mx0
+    sub          dword mx0m, 4<<14
+    sub        dword src_wm, 8
+    vpbroadcastd         m5, dxm
+    vpbroadcastd         m8, mx0m
+    vpbroadcastd         m6, src_wm
+
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+    LEA                  r7, $$
+%define base r7-$$
+
+    vpbroadcastd         m3, [base+pw_m256]
+    vpbroadcastd         m7, [base+pd_63]
+    vbroadcasti128      m15, [base+pb_8x0_8x8]
+    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+    pslld                m5, 3                      ; dx*8
+    pslld                m6, 14
+    paddd                m8, m2                     ; mx+[0..7]*dx
+    pxor                 m2, m2
+
+    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
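+    ; per output pixel x (rough sketch): src position = (mx0 + x*dx) >> 14,
+    ; with the 8-tap filter chosen from bits 8-13 of the 14-bit fraction
+    ; ((pos >> 8) & 63) and out-of-range taps remapped via resize_shuf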
+
+.loop_y:
+    xor                  xd, xd
+    mova                 m4, m8                     ; per-line working version of mx
+
+.loop_x:
+    pmaxsd               m0, m4, m2
+    psrad                m9, m4, 8                  ; filter offset (unmasked)
+    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
+    psubd                m1, m4, m0                 ; pshufb offset
+    psrad                m0, 14                     ; clipped src_x offset
+    psrad                m1, 14                     ; pshufb edge_emu offset
+    pand                 m9, m7                     ; filter offset (masked)
+
+    ; load source pixels - this ugly code is vpgatherdq emulation since
+    ; directly using vpgatherdq on Haswell is quite a bit slower :(
+    movd                r8d, xm0
+    pextrd              r9d, xm0, 1
+    pextrd             r10d, xm0, 2
+    pextrd             r11d, xm0, 3
+    vextracti128        xm0, m0, 1
+    movq               xm12, [srcq+r8]
+    movq               xm13, [srcq+r10]
+    movhps             xm12, [srcq+r9]
+    movhps             xm13, [srcq+r11]
+    movd                r8d, xm0
+    pextrd              r9d, xm0, 1
+    pextrd             r10d, xm0, 2
+    pextrd             r11d, xm0, 3
+    vinserti128         m12, [srcq+r8], 1
+    vinserti128         m13, [srcq+r10], 1
+    vpbroadcastq        m10, [srcq+r9]
+    vpbroadcastq        m11, [srcq+r11]
+    vpblendd            m12, m12, m10, 11000000b
+    vpblendd            m13, m13, m11, 11000000b
+
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    ; this also saves 2 quasi-vpgatherdqs
+    vptest               m1, m1
+    jz .filter
+
+    movd                r8d, xm1
+    pextrd              r9d, xm1, 1
+    pextrd             r10d, xm1, 2
+    pextrd             r11d, xm1, 3
+    movsxd               r8, r8d
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    vextracti128        xm1, m1, 1
+    movq               xm14, [base+resize_shuf+4+r8]
+    movq                xm0, [base+resize_shuf+4+r10]
+    movhps             xm14, [base+resize_shuf+4+r9]
+    movhps              xm0, [base+resize_shuf+4+r11]
+    movd                r8d, xm1
+    pextrd              r9d, xm1, 1
+    pextrd             r10d, xm1, 2
+    pextrd             r11d, xm1, 3
+    movsxd               r8, r8d
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    vinserti128         m14, [base+resize_shuf+4+r8], 1
+    vinserti128          m0, [base+resize_shuf+4+r10], 1
+    vpbroadcastq        m10, [base+resize_shuf+4+r9]
+    vpbroadcastq        m11, [base+resize_shuf+4+r11]
+    vpblendd            m14, m14, m10, 11000000b
+    vpblendd             m0, m0, m11, 11000000b
+
+    paddb               m14, m15
+    paddb                m0, m15
+    pshufb              m12, m14
+    pshufb              m13, m0
+
+.filter:
+    movd                r8d, xm9
+    pextrd              r9d, xm9, 1
+    pextrd             r10d, xm9, 2
+    pextrd             r11d, xm9, 3
+    vextracti128        xm9, m9, 1
+    movq               xm10, [base+resize_filter+r8*8]
+    movq               xm11, [base+resize_filter+r10*8]
+    movhps             xm10, [base+resize_filter+r9*8]
+    movhps             xm11, [base+resize_filter+r11*8]
+    movd                r8d, xm9
+    pextrd              r9d, xm9, 1
+    pextrd             r10d, xm9, 2
+    pextrd             r11d, xm9, 3
+    vinserti128         m10, [base+resize_filter+r8*8], 1
+    vinserti128         m11, [base+resize_filter+r10*8], 1
+    vpbroadcastq        m14, [base+resize_filter+r9*8]
+    vpbroadcastq         m1, [base+resize_filter+r11*8]
+    vpblendd            m10, m10, m14, 11000000b
+    vpblendd            m11, m11, m1, 11000000b
+
+    pmaddubsw           m12, m10
+    pmaddubsw           m13, m11
+    phaddw              m12, m13
+    vextracti128       xm13, m12, 1
+    phaddsw            xm12, xm13
+    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
+    packuswb           xm12, xm12
+    movq          [dstq+xq], xm12
+
+    paddd                m4, m5
+    add                  xd, 8
+    cmp                  xd, dst_wd
+    jl .loop_x
+
+    add                dstq, dst_strideq
+    add                srcq, src_strideq
+    dec                  hd
+    jg .loop_y
+    RET
+
+INIT_YMM avx2
+PREP_BILIN
+PREP_8TAP
+AVG_FN
+W_AVG_FN
+MASK_FN
+
+cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+    lea                  r7, [w_mask_420_avx2_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    movsxd               wq, [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    pmovzxbd             m9, [base+deint_shuf4]
+    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
+    add                  wq, r7
+    W_MASK                0, 4, 0, 1
+    mov               maskq, maskmp
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 8
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    jg .w4_h16
+.w4_end:
+    vextracti128        xm0, m4, 1
+    vpblendd            xm1, xm4, xm0, 0x05
+    vpblendd            xm4, xm4, xm0, 0x0a
+    pshufd              xm1, xm1, q2301
+    psubw               xm4, xm8, xm4
+    psubw               xm4, xm1
+    psrlw               xm4, 2
+    packuswb            xm4, xm4
+    movq            [maskq], xm4
+    RET
+.w4_h16:
+    W_MASK                0, 5, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    phaddd               m4, m5
+    vextracti128        xm1, m0, 1
+    psubw                m4, m8, m4
+    psrlw                m4, 2
+    vpermd               m4, m9, m4
+    vextracti128        xm5, m4, 1
+    packuswb            xm4, xm5
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova            [maskq], xm4
+    RET
+.w8_loop:
+    add               tmp1q, 2*32
+    add               tmp2q, 2*32
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 8
+.w8:
+    vextracti128        xm2, m4, 1
+    vextracti128        xm1, m0, 1
+    psubw               xm4, xm8, xm4
+    psubw               xm4, xm2
+    psrlw               xm4, 2
+    packuswb            xm4, xm4
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    movq            [maskq], xm4
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 16
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    W_MASK                0, 5, 2, 3
+    punpckhqdq           m1, m4, m5
+    punpcklqdq           m4, m5
+    psubw                m1, m8, m1
+    psubw                m1, m4
+    psrlw                m1, 2
+    vpermq               m0, m0, q3120
+    packuswb             m1, m1
+    vpermd               m1, m9, m1
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    mova            [maskq], xm1
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 16
+.w32:
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*0], m0
+    W_MASK                0, 5, 2, 3
+    psubw                m4, m8, m4
+    psubw                m4, m5
+    psrlw                m4, 2
+    vpermq               m0, m0, q3120
+    packuswb             m4, m4
+    vpermd               m4, m9, m4
+    mova   [dstq+strideq*1], m0
+    mova            [maskq], xm4
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop_even:
+    psubw               m10, m8, m4
+    psubw               m11, m8, m5
+    dec                  hd
+.w64_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+.w64:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    test                 hd, 1
+    jz .w64_loop_even
+    psubw                m4, m10, m4
+    psubw                m5, m11, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m9, m4
+    mova            [maskq], m4
+    add               maskq, 32
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop_even:
+    psubw               m12, m8, m4
+    psubw               m13, m8, m5
+    dec                  hd
+.w128_loop:
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    add               tmp1q, 8*32
+    add               tmp2q, 8*32
+    test                 hd, 1
+    jz .w128_even
+    psubw                m4, m10, m4
+    psubw                m5, m11, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m9, m4
+    mova       [maskq+32*0], m4
+    jmp .w128_odd
+.w128_even:
+    psubw               m10, m8, m4
+    psubw               m11, m8, m5
+.w128_odd:
+    W_MASK                0, 4, -4, -3
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*2], m0
+    W_MASK                0, 5, -2, -1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*3], m0
+    test                 hd, 1
+    jz .w128_loop_even
+    psubw                m4, m12, m4
+    psubw                m5, m13, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m9, m4
+    mova       [maskq+32*1], m4
+    add               maskq, 64
+    dec                  hd
+    jg .w128_loop
+    RET
+
+cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+    lea                  r7, [w_mask_422_avx2_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    pxor                 m9, m9
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    pmovzxbd            m10, [base+deint_shuf4]
+    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
+    add                  wq, r7
+    mov               maskq, maskmp
+    W_MASK                0, 4, 0, 1
+    lea            stride3q, [strideq*3]
+    jmp                  wq
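+    ; (common dispatch pattern: wd = tzcnt(w) indexes the per-width jump
+    ;  table, whose entries become absolute targets after add wq, r7)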
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 8
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    jg .w4_h16
+.w4_end:
+    vextracti128        xm5, m4, 1
+    packuswb            xm4, xm5
+    psubb               xm5, xm8, xm4
+    pavgb               xm5, xm9
+    pshufd              xm5, xm5, q3120
+    mova            [maskq], xm5
+    RET
+.w4_h16:
+    W_MASK                0, 5, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermd               m5, m10, m5
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova            [maskq], m5
+    RET
+.w8_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 16
+.w8:
+    vextracti128        xm5, m4, 1
+    vextracti128        xm1, m0, 1
+    packuswb            xm4, xm5
+    psubb               xm5, xm8, xm4
+    pavgb               xm5, xm9
+    pshufd              xm5, xm5, q3120
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    mova            [maskq], xm5
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 32
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    mova            [maskq], m5
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 32
+.w32:
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova   [dstq+strideq*1], m0
+    mova            [maskq], m5
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+    add               maskq, 32
+.w64:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*1], m0
+    mova            [maskq], m5
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+    add               maskq, 32*2
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*0], m5
+    W_MASK                0, 4, 4, 5
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*2], m0
+    W_MASK                0, 5, 6, 7
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*3], m0
+    mova       [maskq+32*1], m5
+    dec                  hd
+    jg .w128_loop
+    RET
+
+cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+    lea                  r7, [w_mask_444_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m5, [base+pb_64]
+    vpbroadcastd         m7, [base+pw_2048]
+    add                  wq, r7
+    W_MASK                0, 4, 0, 1, 1
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    mova       [maskq+32*0], m4
+    cmp                  hd, 8
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    je .w4_end
+    W_MASK                0, 4, 2, 3, 1
+    lea                dstq, [dstq+strideq*4]
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova       [maskq+32*1], m4
+.w4_end:
+    RET
+.w8_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 0, 1, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 32
+.w8:
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    mova            [maskq], m4
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 0, 1, 1
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 32
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova            [maskq], m4
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 0, 1, 1
+    add                dstq, strideq
+    add               maskq, 32
+.w32:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    mova            [maskq], m4
+    dec                  hd
+    jg .w32_loop
+    RET
+.w64_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1, 1
+    add                dstq, strideq
+    add               maskq, 32*2
+.w64:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    mova       [maskq+32*0], m4
+    W_MASK                0, 4, 2, 3, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*1], m4
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    W_MASK                0, 4, 0, 1, 1
+    add                dstq, strideq
+    add               maskq, 32*4
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    mova       [maskq+32*0], m4
+    W_MASK                0, 4, 2, 3, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*1], m4
+    W_MASK                0, 4, 4, 5, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*2], m0
+    mova       [maskq+32*2], m4
+    W_MASK                0, 4, 6, 7, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*3], m0
+    mova       [maskq+32*3], m4
+    dec                  hd
+    jg .w128_loop
+    RET
+
+%if HAVE_AVX512ICL
+INIT_ZMM avx512icl
+PREP_BILIN
+PREP_8TAP
+AVG_FN
+W_AVG_FN
+MASK_FN
+
+cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+    lea                  r7, [w_mask_420_avx512icl_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    movsxd               wq, [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    vpbroadcastd         m9, [base+pb_m64]             ; -1 << 6
+    mova               ym10, [base+wm_420_mask+32]
+    vpbroadcastd         m8, [base+wm_sign_avx512+r6*8] ; (258 - sign) << 6
+    add                  wq, r7
+    mov               maskq, maskmp
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    mova                 m5, [wm_420_perm4]
+    cmp                  hd, 8
+    jg .w4_h16
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    vinserti128         ym5, [wm_420_perm4+32], 1
+    vpermb              ym4, ym5, ym4
+    vpdpbusd            ym8, ym4, ym9
+    vextracti128       xmm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xmm1
+    pextrd [dstq+stride3q ], xmm1, 1
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xmm1, 2
+    pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+    vpermb              ym8, ym10, ym8
+    movq            [maskq], xm8
+    RET
+.w4_h16:
+    vpbroadcastd        m11, strided
+    pmulld              m11, [bidir_sctr_w4]
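+    ; pmulld scales the per-row scatter indices from bidir_sctr_w4 by the
+    ; stride, giving one byte offset per dword lane; a single vpscatterdd
+    ; (with k1 set to all-ones via kxnorw) then stores all 16 4-pixel rows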
+    W_MASK                0, 4, 0, 1
+    vpermb               m4, m5, m4
+    vpdpbusd             m8, m4, m9
+    kxnorw               k1, k1, k1
+    vpermb               m8, m10, m8
+    mova            [maskq], xm8
+    vpscatterdd [dstq+m11]{k1}, m0
+    RET
+.w8:
+    mova                 m5, [wm_420_perm8]
+    cmp                  hd, 4
+    jne .w8_h8
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    vinserti128         ym5, [wm_420_perm8+32], 1
+    vpermb              ym4, ym5, ym4
+    vpdpbusd            ym8, ym4, ym9
+    vpermb               m8, m10, m8
+    mova            [maskq], xm8
+    vextracti128       xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    RET
+.w8_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 16
+    lea                dstq, [dstq+strideq*4]
+.w8_h8:
+    W_MASK                0, 4, 0, 1
+    vpermb               m4, m5, m4
+    mova                 m1, m8
+    vpdpbusd             m1, m4, m9
+    vpermb               m1, m10, m1
+    mova            [maskq], xm1
+    vextracti32x4      xmm1, ym0, 1
+    vextracti32x4      xmm2, m0, 2
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    sub                  hd, 8
+    jg .w8_loop
+    RET
+.w16:
+    mova                 m5, [wm_420_perm16]
+.w16_loop:
+    W_MASK                0, 4, 0, 1
+    vpermb               m4, m5, m4
+    mova                 m1, m8
+    vpdpbusd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m0, q3120
+    mova            [maskq], xm1
+    add               maskq, 16
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxbq             m5, [warp_8x8_shufA]
+.w32_loop:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpbusd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m5, m0
+    mova            [maskq], xm1
+    add               maskq, 16
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxbq            m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+    psrlq               m13, m12, 4          ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+    W_MASK                0, 4, 0, 2
+    W_MASK               11, 5, 1, 3
+    mova                 m2, m8
+    vpdpbusd             m2, m4, m9
+    mova                 m3, m8
+    vpdpbusd             m3, m5, m9
+    add               tmp1q, 256
+    add               tmp2q, 256
+    vpermt2b             m2, m10, m3
+    mova                 m1, m0
+    vpermt2q             m0, m12, m11
+    vpermt2q             m1, m13, m11
+    mova            [maskq], ym2
+    add               maskq, 32
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w64_loop
+    RET
+.w128:
+    pmovzxbq            m14, [wm_420_perm64]
+    mova                m10, [wm_420_mask]
+    psrlq               m15, m14, 4
+.w128_loop:
+    W_MASK                0, 12, 0, 4
+    W_MASK               11, 13, 1, 5
+    mova                 m4, m8
+    vpdpbusd             m4, m12, m9
+    mova                 m5, m8
+    vpdpbusd             m5, m13, m9
+    mova                 m1, m0
+    vpermt2q             m0, m14, m11
+    vpermt2q             m1, m15, m11
+    mova [dstq+strideq*0+64*0], m0
+    mova [dstq+strideq*1+64*0], m1
+    W_MASK                0, 12, 2, 6
+    W_MASK               11, 13, 3, 7
+    vprold               m4, 16
+    vprold               m5, 16
+    vpdpbusd             m4, m12, m9
+    vpdpbusd             m5, m13, m9
+    add               tmp1q, 512
+    add               tmp2q, 512
+    vpermt2b             m4, m10, m5
+    mova                 m1, m0
+    vpermt2q             m0, m14, m11
+    vpermt2q             m1, m15, m11
+    mova            [maskq], m4
+    add               maskq, 64
+    mova [dstq+strideq*0+64*1], m0
+    mova [dstq+strideq*1+64*1], m1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w128_loop
+    RET
+
+cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+    lea                  r7, [w_mask_422_avx512icl_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    vpbroadcastd         m9, [base+pw_m128]
+    mova                m10, [base+wm_422_mask]
+    vpbroadcastd        m11, [base+pb_127]
+    add                  wq, r7
+    vpbroadcastd         m8, [base+wm_sign_avx512+4+r6*4]
+    mov               maskq, maskmp
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    cmp                  hd, 8
+    jg .w4_h16
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    movhps             xm10, [wm_422_mask+16]
+    vpdpwssd            ym8, ym4, ym9
+    vpermb              ym8, ym10, ym8
+    vextracti128       xmm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xmm1
+    pextrd [dstq+stride3q ], xmm1, 1
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xmm1, 2
+    pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+    pand                xm8, xm11
+    mova            [maskq], xm8
+    RET
+.w4_h16:
+    vpbroadcastd         m5, strided
+    pmulld               m5, [bidir_sctr_w4]
+    W_MASK                0, 4, 0, 1
+    vpdpwssd             m8, m4, m9
+    kxnorw               k1, k1, k1
+    vpermb               m8, m10, m8
+    pand                ym8, ym11
+    mova            [maskq], ym8
+    vpscatterdd [dstq+m5]{k1}, m0
+    RET
+.w8:
+    cmp                  hd, 4
+    jne .w8_h8
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    movhps             xm10, [wm_422_mask+16]
+    vpdpwssd            ym8, ym4, ym9
+    vpermb              ym8, ym10, ym8
+    pand                xm8, xm11
+    mova            [maskq], xm8
+    vextracti128       xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    RET
+.w8_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 32
+    lea                dstq, [dstq+strideq*4]
+.w8_h8:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    vpermb               m1, m10, m1
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    vextracti32x4      xmm1, ym0, 1
+    vextracti32x4      xmm2, m0, 2
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    sub                  hd, 8
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 32
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    vpermb               m1, m10, m1
+    vpermq               m0, m0, q3120
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxbq             m5, [warp_8x8_shufA]
+.w32_loop:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m5, m0
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    add               maskq, 32
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxbq             m5, [warp_8x8_shufA]
+.w64_loop:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m5, m0
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    add               maskq, 32
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128:
+    pmovzxbq            m13, [warp_8x8_shufA]
+.w128_loop:
+    W_MASK                0, 4, 0, 1
+    W_MASK               12, 5, 2, 3
+    mova                 m2, m8
+    vpdpwssd             m2, m4, m9
+    mova                 m3, m8
+    vpdpwssd             m3, m5, m9
+    add               tmp1q, 256
+    add               tmp2q, 256
+    vpermt2b             m2, m10, m3
+    vpermq               m0, m13, m0
+    vpermq               m1, m13, m12
+    pand                 m2, m11
+    mova            [maskq], m2
+    add               maskq, 64
+    mova        [dstq+64*0], m0
+    mova        [dstq+64*1], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w128_loop
+    RET
+
+cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+    lea                  r7, [w_mask_444_avx512icl_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m5, [base+pb_64]
+    vpbroadcastd         m7, [base+pw_2048]
+    mova                 m8, [base+wm_444_mask]
+    add                  wq, r7
+    mov               maskq, maskmp
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    cmp                  hd, 8
+    jg .w4_h16
+    WRAP_YMM W_MASK       0, 4, 0, 1, 1
+    vinserti128         ym8, [wm_444_mask+32], 1
+    vpermb              ym4, ym8, ym4
+    mova            [maskq], ym4
+    vextracti128       xmm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xmm1
+    pextrd [dstq+stride3q ], xmm1, 1
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xmm1, 2
+    pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+    RET
+.w4_h16:
+    vpbroadcastd         m9, strided
+    pmulld               m9, [bidir_sctr_w4]
+    W_MASK                0, 4, 0, 1, 1
+    vpermb               m4, m8, m4
+    kxnorw               k1, k1, k1
+    mova            [maskq], m4
+    vpscatterdd [dstq+m9]{k1}, m0
+    RET
+.w8:
+    cmp                  hd, 4
+    jne .w8_h8
+    WRAP_YMM W_MASK       0, 4, 0, 1, 1
+    vinserti128         ym8, [wm_444_mask+32], 1
+    vpermb              ym4, ym8, ym4
+    mova            [maskq], ym4
+    vextracti128       xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    RET
+.w8_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 64
+    lea                dstq, [dstq+strideq*4]
+.w8_h8:
+    W_MASK                0, 4, 0, 1, 1
+    vpermb               m4, m8, m4
+    mova            [maskq], m4
+    vextracti32x4      xmm1, ym0, 1
+    vextracti32x4      xmm2, m0, 2
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    sub                  hd, 8
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 64
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    W_MASK                0, 4, 0, 1, 1
+    vpermb               m4, m8, m4
+    vpermq               m0, m0, q3120
+    mova            [maskq], m4
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxbq             m9, [warp_8x8_shufA]
+.w32_loop:
+    W_MASK                0, 4, 0, 1, 1
+    vpermb               m4, m8, m4
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermq               m0, m9, m0
+    mova            [maskq], m4
+    add               maskq, 64
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxbq             m9, [warp_8x8_shufA]
+.w64_loop:
+    W_MASK                0, 4, 0, 1, 1
+    vpermb               m4, m8, m4
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermq               m0, m9, m0
+    mova            [maskq], m4
+    add               maskq, 64
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128:
+    pmovzxbq            m11, [warp_8x8_shufA]
+.w128_loop:
+    W_MASK                0, 4, 0, 1, 1
+    W_MASK               10, 9, 2, 3, 1
+    vpermb               m4, m8, m4
+    vpermb               m9, m8, m9
+    add               tmp1q, 256
+    add               tmp2q, 256
+    vpermq               m0, m11, m0
+    vpermq              m10, m11, m10
+    mova       [maskq+64*0], m4
+    mova       [maskq+64*1], m9
+    add               maskq, 128
+    mova        [dstq+64*0], m0
+    mova        [dstq+64*1], m10
+    add                dstq, strideq
+    dec                  hd
+    jg .w128_loop
+    RET
+
+%endif ; HAVE_AVX512ICL
+
+%endif ; ARCH_X86_64
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
new file mode 100644
index 0000000..a01ac14
--- /dev/null
+++ b/src/x86/mc_init_tmpl.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+decl_mc_fn(dav1d_put_8tap_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_ssse3);
+decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3);
+decl_mc_fn(dav1d_put_8tap_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_ssse3);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3);
+decl_mc_fn(dav1d_put_8tap_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_ssse3);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
+decl_mc_fn(dav1d_put_bilin_avx2);
+decl_mc_fn(dav1d_put_bilin_ssse3);
+
+decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sse2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2);
+decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sse2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2);
+decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_sse2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2);
+decl_mct_fn(dav1d_prep_bilin_avx512icl);
+decl_mct_fn(dav1d_prep_bilin_avx2);
+decl_mct_fn(dav1d_prep_bilin_ssse3);
+decl_mct_fn(dav1d_prep_bilin_sse2);
+
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
+
+decl_avg_fn(dav1d_avg_avx512icl);
+decl_avg_fn(dav1d_avg_avx2);
+decl_avg_fn(dav1d_avg_ssse3);
+decl_w_avg_fn(dav1d_w_avg_avx512icl);
+decl_w_avg_fn(dav1d_w_avg_avx2);
+decl_w_avg_fn(dav1d_w_avg_ssse3);
+decl_mask_fn(dav1d_mask_avx512icl);
+decl_mask_fn(dav1d_mask_avx2);
+decl_mask_fn(dav1d_mask_ssse3);
+decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
+decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_w_mask_fn(dav1d_w_mask_420_ssse3);
+decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
+decl_w_mask_fn(dav1d_w_mask_422_avx2);
+decl_w_mask_fn(dav1d_w_mask_444_avx512icl);
+decl_w_mask_fn(dav1d_w_mask_444_avx2);
+decl_blend_fn(dav1d_blend_avx2);
+decl_blend_fn(dav1d_blend_ssse3);
+decl_blend_dir_fn(dav1d_blend_v_avx2);
+decl_blend_dir_fn(dav1d_blend_v_ssse3);
+decl_blend_dir_fn(dav1d_blend_h_avx2);
+decl_blend_dir_fn(dav1d_blend_h_ssse3);
+
+decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
+
+decl_emu_edge_fn(dav1d_emu_edge_avx2);
+decl_emu_edge_fn(dav1d_emu_edge_ssse3);
+
+decl_resize_fn(dav1d_resize_avx2);
+decl_resize_fn(dav1d_resize_ssse3);
+
+COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+    c->mc[type] = dav1d_put_##name##_##suffix
+#define init_mct_fn(type, name, suffix) \
+    c->mct[type] = dav1d_prep_##name##_##suffix
+#define init_mc_scaled_fn(type, name, suffix) \
+    c->mc_scaled[type] = dav1d_put_##name##_##suffix
+#define init_mct_scaled_fn(type, name, suffix) \
+    c->mct_scaled[type] = dav1d_prep_##name##_##suffix
+
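+    // e.g. init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3) expands to:
+    //   c->mc[FILTER_2D_BILINEAR] = dav1d_put_bilin_ssse3;
+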
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+        return;
+
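+    // Each CPU tier below overwrites the pointers set by the previous one,
+    // so the fastest implementation supported by the host CPU wins.
+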
+#if BITDEPTH == 8
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          sse2);
+
+    c->warp8x8  = dav1d_warp_affine_8x8_sse2;
+    c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+        return;
+
+#if BITDEPTH == 8
+    init_mc_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
+
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
+
+    c->avg = dav1d_avg_ssse3;
+    c->w_avg = dav1d_w_avg_ssse3;
+    c->mask = dav1d_mask_ssse3;
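+    // only the 4:2:0 variant of w_mask has an SSSE3 implementation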
+    c->w_mask[2] = dav1d_w_mask_420_ssse3;
+    c->blend = dav1d_blend_ssse3;
+    c->blend_v = dav1d_blend_v_ssse3;
+    c->blend_h = dav1d_blend_h_ssse3;
+
+    c->warp8x8  = dav1d_warp_affine_8x8_ssse3;
+    c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
+
+    c->emu_edge = dav1d_emu_edge_ssse3;
+    c->resize = dav1d_resize_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+        return;
+
+#if BITDEPTH == 8
+    c->warp8x8  = dav1d_warp_affine_8x8_sse4;
+    c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
+#endif
+
+#if ARCH_X86_64
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+        return;
+
+#if BITDEPTH == 8
+    init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
+    init_mc_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
+
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
+
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
+
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
+
+    c->avg = dav1d_avg_avx2;
+    c->w_avg = dav1d_w_avg_avx2;
+    c->mask = dav1d_mask_avx2;
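+    // w_mask[] is indexed by chroma subsampling: 0 = 4:4:4, 1 = 4:2:2, 2 = 4:2:0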
+    c->w_mask[0] = dav1d_w_mask_444_avx2;
+    c->w_mask[1] = dav1d_w_mask_422_avx2;
+    c->w_mask[2] = dav1d_w_mask_420_avx2;
+    c->blend = dav1d_blend_avx2;
+    c->blend_v = dav1d_blend_v_avx2;
+    c->blend_h = dav1d_blend_h_avx2;
+
+    c->warp8x8  = dav1d_warp_affine_8x8_avx2;
+    c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
+
+    c->emu_edge = dav1d_emu_edge_avx2;
+    c->resize = dav1d_resize_avx2;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+        return;
+
+#if HAVE_AVX512ICL && BITDEPTH == 8
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx512icl);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx512icl);
+
+    c->avg = dav1d_avg_avx512icl;
+    c->w_avg = dav1d_w_avg_avx512icl;
+    c->mask = dav1d_mask_avx512icl;
+    c->w_mask[0] = dav1d_w_mask_444_avx512icl;
+    c->w_mask[1] = dav1d_w_mask_422_avx512icl;
+    c->w_mask[2] = dav1d_w_mask_420_avx512icl;
+#endif
+#endif
+}
diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
new file mode 100644 (file)
index 0000000..d98ac62
--- /dev/null
@@ -0,0 +1,5935 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db  0,  0,  0,  0
+            ; 2 @4
+            db 45, 19, 64,  0
+            ; 4 @8
+            db 39, 25, 50, 14, 59,  5, 64,  0
+            ; 8 @16
+            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
+            ; 16 @32
+            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
+            ; 32 @64
+            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
+            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
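+            ; (each {m, 64-m} pair sums to 64, so one pmaddubsw can apply
+            ;  both blend weights in a single multiply-add)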
+
+warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
+warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
+warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
+warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
+blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
+                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+
+pb_8x0_8x8: times 8 db 0
+            times 8 db 8
+resize_mul: dd 0, 1, 2, 3
+resize_shuf: times 5 db 0
+             db 1, 2, 3, 4, 5, 6
+             times 5+16 db 7
+
+pb_64:    times 16 db 64
+pw_m256:  times 8 dw -256
+pw_1:     times 8 dw 1
+pw_2:     times 8 dw 2
+pw_8:     times 8 dw 8
+pw_26:    times 8 dw 26
+pw_34:    times 8 dw 34
+pw_512:   times 8 dw 512
+pw_1024:  times 8 dw 1024
+pw_2048:  times 8 dw 2048
+pw_6903:  times 8 dw 6903
+pw_8192:  times 8 dw 8192
+pd_32:    times 4 dd 32
+pd_63:    times 4 dd 63
+pd_512:   times 4 dd 512
+pd_16384: times 4 dd 16384
+pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
+
+pw_258:  times 2 dw 258
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
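+; (the -8 bias lets subpel positions, which start at 1, index the first
+;  8-byte filter entry directly)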
+
+%macro BIDIR_JMP_TABLE 1-*
+    ; %xdefine evaluates at definition time (%2 = first width argument)
+    %xdefine %1_table (%%table - 2*%2)
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    ; dynamically generated label
+    %%table:
+    %rep %0 - 1 ; one entry per width argument
+        dd %%prefix %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
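+; e.g. avg_ssse3_table points 2*4 bytes before its first entry, so indexing
+; with tzcnt(w)*4 maps w=4 -> entry 0, w=8 -> entry 1, etc.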
+
+BIDIR_JMP_TABLE avg_ssse3,        4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
+BIDIR_JMP_TABLE blend_ssse3,      4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_ssse3,    2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_ssse3,    2, 4, 8, 16, 16, 16, 16
+
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
+
+BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
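+; these tables hold 16-bit offsets relative to the shared .put/.prep entry
+; points aliased above; put also handles w2, hence the extra first width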
+
+%macro HV_JMP_TABLE 5-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table  (%%h  - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table  (%%v  - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
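+; the 4th argument is a bitmask selecting which sub-tables to generate:
+; bit 0 = h, bit 1 = v, bit 2 = hv (so 7 = all three, 1 = h only)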
+HV_JMP_TABLE prep,  8tap,  sse2, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin,  sse2, 7,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
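+; e.g. table_offset(put, _bilin_h) -> put_bilin_h_ssse3_table - put_ssse3
+; with the current SUFFIX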
+
+cextern mc_warp_filter
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
+%else
+ DECLARE_REG_TMP 7
+ %define base 0
+%endif
+
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+   mov                  %1, dsm ; restore dsq
+ %endif
+%endmacro
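+; (x86-32 has no spare register for the stride, so it lives in stack memory
+;  (dsm) and is reloaded where needed; this is a no-op on x86-64)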
+
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+    movifnidn          mxyd, r6m ; mx
+    LEA                  t0, put_ssse3
+    tzcnt                wd, wm
+    mov                  hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r7m ; my
+    test               mxyd, mxyd
+    jnz .v
+.put:
+    movzx                wd, word [t0+wq*2+table_offset(put,)]
+    add                  wq, t0
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.put_w2:
+    movzx               r4d, word [srcq+ssq*0]
+    movzx               r6d, word [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4w
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w2
+    RET
+.put_w4:
+    mov                 r4d, [srcq+ssq*0]
+    mov                 r6d, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4d
+    mov        [dstq+dsq*1], r6d
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w4
+    RET
+.put_w8:
+    movq                 m0, [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq       [dstq+dsq*0], m0
+    movq       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w8
+    RET
+.put_w16:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w16
+    RET
+.put_w32:
+    movu                 m0, [srcq+ssq*0+16*0]
+    movu                 m1, [srcq+ssq*0+16*1]
+    movu                 m2, [srcq+ssq*1+16*0]
+    movu                 m3, [srcq+ssq*1+16*1]
+    lea                srcq, [srcq+ssq*2]
+    mova  [dstq+dsq*0+16*0], m0
+    mova  [dstq+dsq*0+16*1], m1
+    mova  [dstq+dsq*1+16*0], m2
+    mova  [dstq+dsq*1+16*1], m3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w32
+    RET
+.put_w64:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    add                srcq, ssq
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w64
+    RET
+.put_w128:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    movu                 m0, [srcq+16*4]
+    movu                 m1, [srcq+16*5]
+    movu                 m2, [srcq+16*6]
+    movu                 m3, [srcq+16*7]
+    mova        [dstq+16*4], m0
+    mova        [dstq+16*5], m1
+    mova        [dstq+16*6], m2
+    mova        [dstq+16*7], m3
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w128
+    RET
+.h:
+    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
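+    ; mxyd*0xff01 + (16 << 8) packs the two weights into each word as the
+    ; byte pair {mx, 16-mx} (e.g. mx=1 -> 0x0f01); pmaddubsw applies them to
+    ; the {src[x+1], src[x]} pairs produced by the bilin_h_shuf* shuffles,
+    ; and pmulhrsw with 2048 then computes the (x + 8) >> 4 rounding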
+    imul               mxyd, 0xff01
+    mova                 m4, [base+bilin_h_shuf8]
+    mova                 m0, [base+bilin_h_shuf4]
+    add                mxyd, 16 << 8
+    movd                 m5, mxyd
+    mov                mxyd, r7m ; my
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+    mova                 m3, [base+pw_2048]
+    add                  wq, t0
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.h_w2:
+    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+    movd                 m0, [srcq+ssq*0]
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m0, m1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    pmulhrsw             m0, m3
+    packuswb             m0, m0
+    movd                r6d, m0
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+    movq                 m4, [srcq+ssq*0]
+    movhps               m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m4, m0
+    pmaddubsw            m4, m5
+    pmulhrsw             m4, m3
+    packuswb             m4, m4
+    movd       [dstq+dsq*0], m4
+    psrlq                m4, 32
+    movd       [dstq+dsq*1], m4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4
+    RET
+.h_w8:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w16
+    RET
+.h_w32:
+    movu                 m0, [srcq+mmsize*0+8*0]
+    movu                 m1, [srcq+mmsize*0+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    movu                 m1, [srcq+mmsize*1+8*0]
+    movu                 m2, [srcq+mmsize*1+8*1]
+    add                srcq, ssq
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    packuswb             m1, m2
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w32
+    RET
+.h_w64:
+    mov                  r6, -16*3
+.h_w64_loop:
+    movu                 m0, [srcq+r6+16*3+8*0]
+    movu                 m1, [srcq+r6+16*3+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova     [dstq+r6+16*3], m0
+    add                  r6, 16
+    jle .h_w64_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    mov                  r6, -16*7
+.h_w128_loop:
+    movu                 m0, [srcq+r6+16*7+8*0]
+    movu                 m1, [srcq+r6+16*7+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    mova     [dstq+r6+16*7], m0
+    add                  r6, 16
+    jle .h_w128_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
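+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+    ; = ((16 - my) * src[x] + my * src[x + src_stride] + 8) >> 4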
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+    imul               mxyd, 0xff01
+    mova                 m5, [base+pw_2048]
+    add                mxyd, 16 << 8
+    add                  wq, t0
+    movd                 m4, mxyd
+    pshuflw              m4, m4, q0000
+    punpcklqdq           m4, m4
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.v_w2:
+    movd                 m0, [srcq+ssq*0]
+.v_w2_loop:
+    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
+    lea                srcq, [srcq+ssq*2]
+    pshuflw              m2, m0, q2301
+    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
+    punpcklbw            m1, m0, m2
+    pmaddubsw            m1, m4
+    pmulhrsw             m1, m5
+    packuswb             m1, m1
+    movd                r6d, m1
+    mov        [dstq+dsq*1], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*0], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                 m0, [srcq+ssq*0]
+.v_w4_loop:
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m2, m0, m1 ; 0 1
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m1, m0  ; 1 2
+    punpcklbw            m1, m2
+    pmaddubsw            m1, m4
+    pmulhrsw             m1, m5
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                 m0, [srcq+ssq*0]
+.v_w8_loop:
+    movq                 m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m3, m0
+    movq                 m0, [srcq+ssq*0]
+    punpcklbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    movq       [dstq+dsq*0], m1
+    movhps     [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+    ;
+%macro PUT_BILIN_V_W16 0
+    movu                 m0, [srcq+ssq*0]
+%%loop:
+    movu                 m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m3, m0
+    punpckhbw            m2, m3, m0
+    movu                 m0, [srcq+ssq*0]
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*0], m1
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg %%loop
+%endmacro
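+    ; no RET inside the macro on purpose: .v_w16 supplies its own, and
+    ; .v_w16gt re-runs the expansion once per 16-pixel column strip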
+    ;
+.v_w16:
+    PUT_BILIN_V_W16
+    RET
+.v_w16gt:
+    mov                  r4, dstq
+    mov                  r6, srcq
+.v_w16gt_loop:
+%if ARCH_X86_32
+    mov                bakm, t0q
+    RESTORE_DSQ_32       t0
+    PUT_BILIN_V_W16
+    mov                 t0q, bakm
+%else
+    PUT_BILIN_V_W16
+%endif
+    mov                  hw, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .v_w16gt
+    RET
+.v_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .v_w16gt
+.v_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .v_w16gt
+.v_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .v_w16gt
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
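+    ; Fixed-point note on the second stage below: m6 holds my << 11 and the
+    ; row difference is doubled first, so pmulhw in effect computes
+    ;   (2 * d * (my << 11)) >> 16 == (my * d) >> 4
+    ; without overflowing signed 16-bit intermediates.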
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+    WIN64_SPILL_XMM       8
+    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
+    mova                 m7, [base+pw_2048]
+    movd                 m6, mxyd
+    add                  wq, t0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    jmp                  wq
+.hv_w2:
+    RESTORE_DSQ_32       t0
+    movd                 m0, [srcq+ssq*0]
+    pshufd               m0, m0, q0000      ; src[x - src_stride]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w2_loop:
+    movd                 m1, [srcq+ssq*1]   ; src[x]
+    lea                srcq, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*0]   ; src[x + src_stride]
+    pshufd               m1, m1, q3120
+    pshufb               m1, m4
+    pmaddubsw            m1, m5             ; 1 _ 2 _
+    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
+    mova                 m0, m1
+    psubw                m1, m2   ; src[x + src_stride] - src[x]
+    paddw                m1, m1
+    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
+    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+%if ARCH_X86_64
+    movq                 r6, m1
+%else
+    pshuflw              m1, m1, q2020
+    movd                r6d, m1
+%endif
+    mov        [dstq+dsq*0], r6w
+    shr                  r6, gprsize*4
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                 m4, [base+bilin_h_shuf4]
+    RESTORE_DSQ_32       t0
+    movddup             xm0, [srcq+ssq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w4_loop:
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*0]
+    pshufb               m1, m4
+    pmaddubsw            m1, m5           ; 1 2
+    shufps               m2, m0, m1, q1032 ; 0 1
+    mova                 m0, m1
+    psubw                m1, m2
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m2
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    RESTORE_DSQ_32       t0
+    movu                 m0, [srcq+ssq*0+8*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                 m2, [srcq+ssq*1+8*0]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m1, m2, m0
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m0
+    movu                 m0, [srcq+ssq*0+8*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    psubw                m3, m0, m2
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m2
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    movq       [dstq+dsq*0], m1
+    movhps     [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    xor                 t0d, t0d
+.hv_w16gt:
+    mov                  r4, dstq
+    mov                  r6, srcq
+ %if WIN64
+    movaps              r4m, xmm8
+ %endif
+.hv_w16_loop0:
+    movu                 m0,     [srcq+8*0]
+    movu                 m1,     [srcq+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+.hv_w16_loop:
+%if ARCH_X86_32
+ %define m0tmp [dstq]
+%else
+ %define m0tmp m8
+%endif
+    add                srcq, ssq
+    movu                 m2, [srcq+8*0]
+    movu                 m3, [srcq+8*1]
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova              m0tmp, m2
+    psubw                m2, m0
+    paddw                m2, m2
+    pmulhw               m2, m6
+    paddw                m2, m0
+    mova                 m0, m3
+    psubw                m3, m1
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m1
+    mova                 m1, m0
+    mova                 m0, m0tmp
+    pmulhrsw             m2, m7
+    pmulhrsw             m3, m7
+    packuswb             m2, m3
+    mova             [dstq], m2
+    add                dstq, dsmp
+    dec                  hd
+    jg .hv_w16_loop
+    movzx                hd, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .hv_w16_loop0
+ %if WIN64
+    movaps             xmm8, r4m
+ %endif
+    RET
+.hv_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .hv_w16gt
+.hv_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .hv_w16gt
+.hv_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .hv_w16gt
+
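+; The macros below fall back to SSE2 when cpuflag(ssse3) is absent: each
+; expands to the single SSSE3 instruction when available and to an
+; equivalent SSE2 sequence otherwise. A rough C model of the two arithmetic
+; ones, per 16-bit lane (a bytes unsigned, b bytes signed):
+;   pmaddubsw: dst[i] = sat16(u8(a[2*i+0]) * s8(b[2*i+0]) +
+;                             u8(a[2*i+1]) * s8(b[2*i+1]));
+;   pmulhrsw:  dst[i] = (s16(a[i]) * s16(b[i]) + (1 << 14)) >> 15;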
+%macro PSHUFB_0X1X 1-2 ; dst[, src]
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    punpcklbw            %1, %1
+    psraw                %1, 8
+    pshufd               %1, %1, q0000
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    mova                 %2, %1
+    psrldq               %1, 1
+    punpcklbw            %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    mova                 %2, %1
+    psrldq               %1, 1
+    punpckhbw            %3, %1, %2
+    punpcklbw            %1, %2
+    punpcklqdq           %1, %3
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+    pmaddubsw            %1, %2
+ %else
+  %if %5 == 1
+    pxor                 %3, %3
+  %endif
+    punpckhbw            %4, %1, %3
+    punpcklbw            %1, %1, %3
+    pmaddwd              %4, %2
+    pmaddwd              %1, %2
+    packssdw             %1, %4
+ %endif
+%endmacro
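+; The reset_zero argument re-creates the zero register with pxor before the
+; widening unpacks when a caller may have clobbered it; see the
+; ARCH_X86_32 operand passed in the prep hv_w8 loop below.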
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+    pmulhrsw             %1, %2
+ %else
+    punpckhwd            %3, %1, %4
+    punpcklwd            %1, %4
+    pmaddwd              %3, %2
+    pmaddwd              %1, %2
+    psrad                %3, %5
+    psrad                %1, %5
+    packssdw             %1, %3
+ %endif
+%endmacro
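+; With rndval = pw_8 and shift = 4, as the bilinear hv code below uses it,
+; both branches compute (my * a + 8) >> 4 per lane: SSSE3 via pmulhrsw
+; against my << 11, SSE2 via pmaddwd on interleaved (value, 8) and (my, 1)
+; word pairs followed by psrad.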
+
+%macro PREP_BILIN 0
+
+DECLARE_REG_TMP 3, 5, 6
+%if ARCH_X86_32
+ %define base        t2-prep%+SUFFIX
+%else
+ %define base        0
+%endif
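+; PREP_BILIN is expanded once per instruction set (SUFFIX selects the sse2
+; or ssse3 instantiation), which is what the cpuflag() branches and the
+; prep_ssse3 - prep_sse2 redirect in .prep key off.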
+
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+    movifnidn          mxyd, r5m ; mx
+    LEA                  t2, prep%+SUFFIX
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r6m ; my
+    test               mxyd, mxyd
+    jnz .v
+.prep:
+%if notcpuflag(ssse3)
+    add                  t2, prep_ssse3 - prep_sse2
+    jmp prep_ssse3
+%else
+    movzx                wd, word [t2+wq*2+table_offset(prep,)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.prep_w4:
+    movd                 m0, [srcq+strideq*0]
+    movd                 m1, [srcq+strideq*1]
+    movd                 m2, [srcq+strideq*2]
+    movd                 m3, [srcq+stride3q ]
+    punpckldq            m0, m1
+    punpckldq            m2, m3
+    lea                srcq, [srcq+strideq*4]
+    pxor                 m1, m1
+    punpcklbw            m0, m1
+    punpcklbw            m2, m1
+    psllw                m0, 4
+    psllw                m2, 4
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .prep_w4
+    RET
+.prep_w8:
+    movq                 m0, [srcq+strideq*0]
+    movq                 m1, [srcq+strideq*1]
+    movq                 m2, [srcq+strideq*2]
+    movq                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .prep_w8
+    RET
+.prep_w16:
+    movq                 m0, [srcq+strideq*0+8*0]
+    movq                 m1, [srcq+strideq*0+8*1]
+    movq                 m2, [srcq+strideq*1+8*0]
+    movq                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 2
+    jg .prep_w16
+    RET
+.prep_w32:
+    mov                 t2d, 1
+    jmp .prep_w32_vloop
+.prep_w64:
+    mov                 t2d, 2
+    jmp .prep_w32_vloop
+.prep_w128:
+    mov                 t2d, 4
+.prep_w32_vloop:
+    mov                 t1q, srcq
+    mov                 r3d, t2d
+.prep_w32_hloop:
+    movq                 m0, [t1q+8*0]
+    movq                 m1, [t1q+8*1]
+    movq                 m2, [t1q+8*2]
+    movq                 m3, [t1q+8*3]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    add                 t1q, 32
+    dec                 r3d
+    jg .prep_w32_hloop
+    lea                srcq, [srcq+strideq]
+    dec                  hd
+    jg .prep_w32_vloop
+    RET
+%endif
+.h:
+    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+    ; = (16 - mx) * src[x] + mx * src[x + 1]
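+    ; prep keeps full 16-bit intermediates and applies no rounding shift,
+    ; so in scalar form: tmp[x] = (16 - mx) * src[x] + mx * src[x + 1];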
+    imul               mxyd, 0xff01
+%if cpuflag(ssse3)
+    mova                 m4, [base+bilin_h_shuf8]
+%endif
+    add                mxyd, 16 << 8
+    movd                 m5, mxyd
+    mov                mxyd, r6m ; my
+%if cpuflag(ssse3)
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+%else
+    PSHUFB_0X1X          m5
+%endif
+    test               mxyd, mxyd
+    jnz .hv
+%if ARCH_X86_32
+    mov                  t1, t2 ; save base reg for w4
+%endif
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+    pxor                 m6, m6
+%endif
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.h_w4:
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
+    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
+ %else
+    mova                 m4, [bilin_h_shuf4]
+ %endif
+%endif
+.h_w4_loop:
+    movq                 m0, [srcq+strideq*0]
+    movhps               m0, [srcq+strideq*1]
+    movq                 m1, [srcq+strideq*2]
+    movhps               m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    PSHUFB_BILIN_H4      m0, m4, m2
+    PMADDUBSW            m0, m5, m6, m2, 0
+    PSHUFB_BILIN_H4      m1, m4, m2
+    PMADDUBSW            m1, m5, m6, m2, 0
+    mova          [tmpq+0 ], m0
+    mova          [tmpq+16], m1
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+.h_w8:
+    movu                 m0, [srcq+strideq*0]
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    movu                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .h_w8
+    RET
+.h_w16:
+    movu                 m0, [srcq+strideq*0+8*0]
+    movu                 m1, [srcq+strideq*0+8*1]
+    movu                 m2, [srcq+strideq*1+8*0]
+    movu                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    mov                 t2d, 1 << 0
+    jmp .h_w32_vloop
+.h_w64:
+    mov                 t2d, 1 << 1
+    jmp .h_w32_vloop
+.h_w128:
+    mov                 t2d, 1 << 3
+.h_w32_vloop:
+    mov                 t1q, srcq
+    mov                 r3d, t2d
+.h_w32_hloop:
+    movu                 m0, [t1q+8*0]
+    movu                 m1, [t1q+8*1]
+    movu                 m2, [t1q+8*2]
+    movu                 m3, [t1q+8*3]
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    add                 t1q, 32
+    shr                 r3d, 1
+    jnz .h_w32_hloop
+    lea                srcq, [srcq+strideq]
+    sub                  hd, 1
+    jg .h_w32_vloop
+    RET
+.v:
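+    ; 16 * src[x] + (my * (src[x + src_stride] - src[x]))
+    ; = (16 - my) * src[x] + my * src[x + src_stride]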
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM 8
+%endif
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 0xff01
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    movd                 m5, mxyd
+%if cpuflag(ssse3)
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+%else
+    PSHUFB_0X1X          m5
+    pxor                 m6, m6
+%endif
+    jmp                  wq
+.v_w4:
+    movd                 m0, [srcq+strideq*0]
+.v_w4_loop:
+    movd                 m1, [srcq+strideq*1]
+    movd                 m2, [srcq+strideq*2]
+    movd                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    punpcklwd            m0, m1  ; 0 1 _ _
+    punpcklwd            m1, m2  ; 1 2 _ _
+    punpcklbw            m1, m0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    pshufd               m1, m1, q3120
+    mova        [tmpq+16*0], m1
+    movd                 m0, [srcq+strideq*0]
+    punpcklwd            m2, m3  ; 2 3 _ _
+    punpcklwd            m3, m0  ; 3 4 _ _
+    punpcklbw            m3, m2
+    PMADDUBSW            m3, m5, m6, m7, 0
+    pshufd               m3, m3, q3120
+    mova        [tmpq+16*1], m3
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                 m0, [srcq+strideq*0]
+.v_w8_loop:
+    movq                 m1, [srcq+strideq*2]
+    movq                 m2, [srcq+strideq*1]
+    movq                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    shufpd               m4, m0, m1, 0x0c       ; 0 2
+    movq                 m0, [srcq+strideq*0]
+    shufpd               m2, m3, 0x0c           ; 1 3
+    shufpd               m1, m0, 0x0c           ; 2 4
+    punpcklbw            m3, m2, m4
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*0], m3
+    punpckhbw            m3, m2, m4
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*2], m3
+    punpcklbw            m3, m1, m2
+    punpckhbw            m1, m2
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*1], m3
+    mova        [tmpq+16*3], m1
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+    movu                 m0, [srcq+strideq*0]
+.v_w16_loop:
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    punpcklbw            m3, m1, m0
+    punpckhbw            m4, m1, m0
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*0], m3
+    mova        [tmpq+16*1], m4
+    punpcklbw            m3, m2, m1
+    punpckhbw            m4, m2, m1
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*2], m3
+    mova        [tmpq+16*3], m4
+    movu                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movu                 m0, [srcq+strideq*0]
+    add                tmpq, 16*8
+    punpcklbw            m1, m3, m2
+    punpckhbw            m4, m3, m2
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq-16*4], m1
+    mova        [tmpq-16*3], m4
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    mova        [tmpq-16*2], m1
+    mova        [tmpq-16*1], m2
+    sub                  hd, 4
+    jg .v_w16_loop
+    RET
+.v_w32:
+    lea                 t2d, [hq+(0<<16)]
+    mov                 t0d, 64
+    jmp .v_w32_start
+.v_w64:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 128
+    jmp .v_w32_start
+.v_w128:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 256
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+    PUSH                 r7
+ %endif
+    mov                  r7, tmpq
+%endif
+    mov                  t1, srcq
+.v_w32_hloop:
+    movu                 m0, [srcq+strideq*0+16*0]
+    movu                 m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+    movu                 m2, [srcq+strideq*1+16*0]
+    movu                 m3, [srcq+strideq*1+16*1]
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m4, m2, m0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*0], m4
+    punpckhbw            m4, m2, m0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*1], m4
+    punpcklbw            m4, m3, m1
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*2], m4
+    punpckhbw            m4, m3, m1
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*3], m4
+    add                tmpq, t0q
+    movu                 m0, [srcq+strideq*0+16*0]
+    movu                 m1, [srcq+strideq*0+16*1]
+    punpcklbw            m4, m0, m2
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*0], m4
+    punpckhbw            m4, m0, m2
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*1], m4
+    punpcklbw            m4, m1, m3
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*2], m4
+    punpckhbw            m4, m1, m3
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*3], m4
+    add                tmpq, t0q
+    sub                  hd, 2
+    jg .v_w32_vloop
+    movzx                hd, t2w
+    add                  t1, 32
+    mov                srcq, t1
+%if ARCH_X86_64
+    add                  r7, 2*16*2
+    mov                tmpq, r7
+%else
+    mov                tmpq, tmpmp
+    add                tmpq, 2*16*2
+    mov               tmpmp, tmpq
+%endif
+    sub                 t2d, 1<<16
+    jg .v_w32_hloop
+%if WIN64
+    POP                  r7
+%endif
+    RET
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
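+    ; src[] in the formula above are already the 16-bit h-stage outputs;
+    ; with mid/below naming two such rows, one output row is roughly:
+    ;   tmp[x] = mid[x] + ((my * (below[x] - mid[x]) + 8) >> 4);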
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%else
+    WIN64_SPILL_XMM 10
+%endif
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+%if cpuflag(ssse3)
+    shl                mxyd, 11
+%else
+ %if ARCH_X86_64
+    mova                 m8, [pw_8]
+ %else
+  %define m8 [pw_8]
+ %endif
+    pxor                 m7, m7
+%endif
+    movd                 m6, mxyd
+    add                  wq, t2
+    pshuflw              m6, m6, q0000
+%if cpuflag(ssse3)
+    punpcklqdq           m6, m6
+%else
+ %if ARCH_X86_64
+    psrlw                m0, m8, 3
+    punpcklwd            m6, m0
+ %else
+    punpcklwd            m6, [base+pw_1]
+ %endif
+%endif
+%if ARCH_X86_32
+    mov                  t1, t2 ; save base reg for w4
+%endif
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.hv_w4:
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
+    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
+ %else
+    mova                 m4, [bilin_h_shuf4]
+ %endif
+%endif
+    movhps               m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H4      m0, m4, m3
+    PMADDUBSW            m0, m5, m7, m4, 0 ; _ 0
+.hv_w4_loop:
+    movq                 m1, [srcq+strideq*1]
+    movhps               m1, [srcq+strideq*2]
+    movq                 m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movhps               m2, [srcq+strideq*0]
+    PSHUFB_BILIN_H4      m1, m4, m3
+    PSHUFB_BILIN_H4      m2, m4, m3
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 1 2
+    shufpd               m3, m0, m1, 0x01  ; 0 1
+    mova                 m0, m2
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 3 4
+    shufpd               m2, m1, m0, 0x01  ; 2 3
+    psubw                m1, m3
+    PMULHRSW             m1, m6, m4, m8, 4
+    paddw                m1, m3
+    psubw                m3, m0, m2
+    PMULHRSW             m3, m6, m4, m8, 4
+    paddw                m3, m2
+    mova        [tmpq+16*0], m1
+    mova        [tmpq+16*1], m3
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    movu                 m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 0
+.hv_w8_loop:
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 1
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 2
+    psubw                m3, m1, m0
+    PMULHRSW             m3, m6, m4, m8, 4
+    paddw                m3, m0
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m9, m7
+%endif
+    psubw                m7, m2, m1
+    PMULHRSW             m7, m6, m4, m8, 4
+    paddw                m7, m1
+    mova        [tmpq+16*0], m3
+    mova        [tmpq+16*1], m7
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m7, m9
+%endif
+    movu                 m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movu                 m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m1, m5, m7, m4, ARCH_X86_32 ; 3
+    PMADDUBSW            m0, m5, m7, m4, 0           ; 4
+    psubw                m3, m1, m2
+    PMULHRSW             m3, m6, m4, m8, 4
+    paddw                m3, m2
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m9, m7
+%endif
+    psubw                m7, m0, m1
+    PMULHRSW             m7, m6, m4, m8, 4
+    paddw                m7, m1
+    mova        [tmpq+16*2], m3
+    mova        [tmpq+16*3], m7
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m7, m9
+ %else
+    pxor                 m7, m7
+ %endif
+%endif
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    mov                 t2d, hd
+    mov                 t0d, 32
+    jmp .hv_w16_start
+.hv_w32:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 64
+    jmp .hv_w16_start
+.hv_w64:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 128
+    jmp .hv_w16_start
+.hv_w128:
+    lea                 t2d, [hq+(7<<16)]
+    mov                 t0d, 256
+.hv_w16_start:
+%if ARCH_X86_64
+ %if WIN64
+    PUSH                 r7
+ %endif
+    mov                  r7, tmpq
+%endif
+    mov                  t1, srcq
+.hv_w16_hloop:
+    movu                 m0, [srcq+strideq*0+8*0]
+    movu                 m1, [srcq+strideq*0+8*1]
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 0a
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+    movu                 m2, [srcq+strideq*1+8*0]
+    PSHUFB_BILIN_H8      m2, m4
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 1a
+    psubw                m3, m2, m0
+    PMULHRSW             m3, m6, m4, m8, 4
+    paddw                m3, m0
+    mova        [tmpq+16*0], m3
+    movu                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m3, m5, m7, m4, 0 ; 1b
+    psubw                m0, m3, m1
+    PMULHRSW             m0, m6, m4, m8, 4
+    paddw                m0, m1
+    mova        [tmpq+16*1], m0
+    add                tmpq, t0q
+    movu                 m0, [srcq+strideq*0+8*0]
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 2a
+    psubw                m1, m0, m2
+    PMULHRSW             m1, m6, m4, m8, 4
+    paddw                m1, m2
+    mova        [tmpq+16*0], m1
+    movu                 m1, [srcq+strideq*0+8*1]
+    PSHUFB_BILIN_H8      m1, m4
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 2b
+    psubw                m2, m1, m3
+    PMULHRSW             m2, m6, m4, m8, 4
+    paddw                m2, m3
+    mova        [tmpq+16*1], m2
+    add                tmpq, t0q
+    sub                  hd, 2
+    jg .hv_w16_vloop
+    movzx                hd, t2w
+    add                  t1, 16
+    mov                srcq, t1
+%if ARCH_X86_64
+    add                  r7, 2*16
+    mov                tmpq, r7
+%else
+    mov                tmpq, tmpmp
+    add                tmpq, 2*16
+    mov               tmpmp, tmpq
+%endif
+    sub                 t2d, 1<<16
+    jg .hv_w16_hloop
+%if WIN64
+    POP                  r7
+%endif
+    RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
+%assign FILTER_SHARP   (2*15 << 16) | 3*15
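+; Each constant packs two row offsets into the table above: the vertical
+; filter set in the high 16 bits and the horizontal set in the low 16,
+; consumed by the imul/add index mangling at the top of put_8tap below.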
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%macro PUT_8TAP_FN 3 ; type, type_h, type_v
+cglobal put_8tap_%1
+    mov                 t0d, FILTER_%2
+    mov                 t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
+%endif
+%endmacro
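+; Each wrapper entry point below only loads the two filter-set ids into
+; t0d/t1d and tail-jumps into the shared put_8tap body; sharp_smooth, being
+; last, omits the jump and falls straight through into it.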
+
+PUT_8TAP_FN regular,        REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_FN sharp,          SHARP,   SHARP
+PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+%if ARCH_X86_32
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+ %define W32_RESTORE_DSQ mov dsq, dsm
+ %define W32_RESTORE_SSQ mov ssq, ssm
+%else
+ %define base_reg r8
+ %define base 0
+ %define W32_RESTORE_DSQ
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%assign org_stack_offset stack_offset
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+%else
+    imul                ssd, mym, 0x010101
+    add                 ssd, t1d ; 8tap_v, my, 4tap_v
+    mov                srcq, srcm
+%endif
+    mov                  wd, wm
+    movifnidn            hd, hm
+    LEA            base_reg, put_ssse3
+    test                mxd, 0xf00
+    jnz .h
+%if ARCH_X86_32
+    test                ssd, 0xf00
+%else
+    test                myd, 0xf00
+%endif
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
+    add                  wq, base_reg
+; put_bilin mangling jump
+%assign stack_offset org_stack_offset
+%if ARCH_X86_32
+    mov                 dsq, dsm
+    mov                 ssq, ssm
+%elif WIN64
+    pop                  r8
+%endif
+    lea                  r6, [ssq*3]
+    jmp                  wq
+.h:
+%if ARCH_X86_32
+    test                ssd, 0xf00
+%else
+    test                myd, 0xf00
+%endif
+    jnz .hv
+    W32_RESTORE_SSQ
+    WIN64_SPILL_XMM      12
+    cmp                  wd, 4
+    jl .h_w2
+    je .h_w4
+    tzcnt                wd, wd
+%if ARCH_X86_64
+    mova                m10, [base+subpel_h_shufA]
+    mova                m11, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+%endif
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+    movd                 m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
+    pshufd               m5, m5, q0000
+    movd                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
+    pshufd               m6, m6, q0000
+    mova                 m7, [base+pw_34] ; 2 + (8 << 2)
+    add                  wq, base_reg
+    jmp                  wq
+.h_w2:
+%if ARCH_X86_32
+    and                 mxd, 0x7f
+%else
+    movzx               mxd, mxb
+%endif
+    dec                srcq
+    mova                 m4, [base+subpel_h_shuf4]
+    movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+    pshufd               m3, m3, q0000
+    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
+    W32_RESTORE_DSQ
+.h_w2_loop:
+    movq                 m0, [srcq+ssq*0]
+    movhps               m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m4
+    pmaddubsw            m0, m3
+    phaddw               m0, m0
+    paddw                m0, m5 ; pw34
+    psraw                m0, 6
+    packuswb             m0, m0
+    movd                r4d, m0
+    mov        [dstq+dsq*0], r4w
+    shr                 r4d, 16
+    mov        [dstq+dsq*1], r4w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+%if ARCH_X86_32
+    and                 mxd, 0x7f
+%else
+    movzx               mxd, mxb
+%endif
+    dec                srcq
+    movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+    pshufd               m3, m3, q0000
+    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
+    mova                 m6, [base+subpel_h_shufA]
+    W32_RESTORE_DSQ
+.h_w4_loop:
+    movq                 m0, [srcq+ssq*0] ; 1
+    movq                 m1, [srcq+ssq*1] ; 2
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m6 ; subpel_h_shufA
+    pshufb               m1, m6 ; subpel_h_shufA
+    pmaddubsw            m0, m3 ; subpel_filters
+    pmaddubsw            m1, m3 ; subpel_filters
+    phaddw               m0, m1
+    paddw                m0, m5 ; pw34
+    psraw                m0, 6
+    packuswb             m0, m0
+    movd       [dstq+dsq*0], m0
+    psrlq                m0, 32
+    movd       [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+    ;
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ %if ARCH_X86_32
+    pshufb              %2, %1, [base+subpel_h_shufB]
+    pshufb              %3, %1, [base+subpel_h_shufC]
+    pshufb              %1,     [base+subpel_h_shufA]
+ %else
+    pshufb              %2, %1, m11; subpel_h_shufB
+    pshufb              %3, %1, m9 ; subpel_h_shufC
+    pshufb              %1, m10    ; subpel_h_shufA
+ %endif
+    pmaddubsw           %4, %2, m5 ; subpel +0 B0
+    pmaddubsw           %2, m6     ; subpel +4 B4
+    pmaddubsw           %3, m6     ; C4
+    pmaddubsw           %1, m5     ; A0
+    paddw               %3, %4     ; C4+B0
+    paddw               %1, %2     ; A0+B4
+    phaddw              %1, %3
+    paddw               %1, m7     ; pw34
+    psraw               %1, 6
+%endmacro
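+    ; PUT_8TAP_H evaluates the 8-tap horizontal filter as two 4-tap
+    ; pmaddubsw halves (A0+B4, C4+B0) merged by phaddw; per output pixel,
+    ; in scalar terms, roughly:
+    ;   dst[x] = (sum(f[k] * src[x + k - 3], k = 0..7) + 34) >> 6;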
+    ;
+.h_w8:
+    movu                 m0,     [srcq+ssq*0]
+    movu                 m1,     [srcq+ssq*1]
+    PUT_8TAP_H           m0, m2, m3, m4
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H           m1, m2, m3, m4
+    packuswb             m0, m1
+%if ARCH_X86_32
+    movq       [dstq      ], m0
+    add                dstq, dsm
+    movhps     [dstq      ], m0
+    add                dstq, dsm
+%else
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+%endif
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    xor                 r6d, r6d
+    jmp .h_start
+.h_w32:
+    mov                  r6, -16*1
+    jmp .h_start
+.h_w64:
+    mov                  r6, -16*3
+    jmp .h_start
+.h_w128:
+    mov                  r6, -16*7
+.h_start:
+    sub                srcq, r6
+    sub                dstq, r6
+    mov                  r4, r6
+.h_loop:
+    movu                 m0, [srcq+r6+8*0]
+    movu                 m1, [srcq+r6+8*1]
+    PUT_8TAP_H           m0, m2, m3, m4
+    PUT_8TAP_H           m1, m2, m3, m4
+    packuswb             m0, m1
+    mova          [dstq+r6], m0
+    add                  r6, mmsize
+    jle .h_loop
+    add                srcq, ssq
+%if ARCH_X86_32
+    add                dstq, dsm
+%else
+    add                dstq, dsq
+%endif
+    mov                  r6, r4
+    dec                  hd
+    jg .h_loop
+    RET
+.v:
+%if ARCH_X86_32
+    movzx               mxd, ssb
+    shr                 ssd, 16
+    cmp                  hd, 6
+    cmovs               ssd, mxd
+    lea                 ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+%else
+ %assign stack_offset org_stack_offset
+    WIN64_SPILL_XMM      16
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    lea                 myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+    tzcnt               r6d, wd
+    movzx               r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+    mova                 m7, [base+pw_512]
+    psrlw                m2, m7, 1 ; 0x0100
+    add                  r6, base_reg
+%if ARCH_X86_32
+ %define            subpel0  [rsp+mmsize*0]
+ %define            subpel1  [rsp+mmsize*1]
+ %define            subpel2  [rsp+mmsize*2]
+ %define            subpel3  [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+    ALLOC_STACK   -mmsize*4
+%assign regs_used 7
+    movd                 m0, [ssq+0]
+    pshufb               m0, m2
+    mova            subpel0, m0
+    movd                 m0, [ssq+2]
+    pshufb               m0, m2
+    mova            subpel1, m0
+    movd                 m0, [ssq+4]
+    pshufb               m0, m2
+    mova            subpel2, m0
+    movd                 m0, [ssq+6]
+    pshufb               m0, m2
+    mova            subpel3, m0
+    mov                 ssq, [rstk+stack_offset+gprsize*4]
+    lea                 ssq, [ssq*3]
+    sub                srcq, ssq
+    mov                 ssq, [rstk+stack_offset+gprsize*4]
+    mov                 dsq, [rstk+stack_offset+gprsize*2]
+%else
+ %define            subpel0  m8
+ %define            subpel1  m9
+ %define            subpel2  m10
+ %define            subpel3  m11
+    movd            subpel0, [myq+0]
+    pshufb          subpel0, m2
+    movd            subpel1, [myq+2]
+    pshufb          subpel1, m2
+    movd            subpel2, [myq+4]
+    pshufb          subpel2, m2
+    movd            subpel3, [myq+6]
+    pshufb          subpel3, m2
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+%endif
+    jmp                  r6
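+    ; The vertical loops below keep source rows as interleaved pairs
+    ; (01 12, 23 34, 45 56, 67 78), so each pmaddubsw against a broadcast
+    ; (f[2k], f[2k+1]) coefficient pair accumulates two taps for two output
+    ; rows at once (hence the "a0 b0" style annotations).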
+.v_w2:
+    movd                 m2, [srcq+ssq*0]    ; 0
+    pinsrw               m2, [srcq+ssq*1], 2 ; 0 1
+    pinsrw               m2, [srcq+ssq*2], 4 ; 0 1 2
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+    pinsrw               m2, [srcq+ssq*0], 6 ; 0 1 2 3
+    add                srcq, ssq
+%else
+    pinsrw               m2, [srcq+ss3q ], 6 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*4]
+%endif
+    movd                 m3, [srcq+ssq*0]    ; 4
+    movd                 m1, [srcq+ssq*1]    ; 5
+    movd                 m0, [srcq+ssq*2]    ; 6
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+%else
+    add                srcq, ss3q
+%endif
+    punpckldq            m3, m1              ; 4 5 _ _
+    punpckldq            m1, m0              ; 5 6 _ _
+    palignr              m4, m3, m2, 4       ; 1 2 3 4
+    punpcklbw            m3, m1              ; 45 56
+    punpcklbw            m1, m2, m4          ; 01 12
+    punpckhbw            m2, m4              ; 23 34
+.v_w2_loop:
+    pmaddubsw            m5, m1, subpel0     ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m2, subpel1         ; a1 b1
+    paddw                m5, m2
+    mova                 m2, m3
+    pmaddubsw            m3, subpel2         ; a2 b2
+    paddw                m5, m3
+    movd                 m4, [srcq+ssq*0]    ; 7
+    punpckldq            m3, m0, m4          ; 6 7 _ _
+    movd                 m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m4, m0              ; 7 8 _ _
+    punpcklbw            m3, m4              ; 67 78
+    pmaddubsw            m4, m3, subpel3     ; a3 b3
+    paddw                m5, m4
+    pmulhrsw             m5, m7
+    packuswb             m5, m5
+    pshuflw              m5, m5, q2020
+    movd                r6d, m5
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+%endif ; ARCH_X86_32
+    lea                 r6d, [wq - 4] ; horizontal loop
+    mov                  r4, dstq
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define               srcm [rsp+mmsize*4+gprsize]
+%endif
+    mov                srcm, srcq
+%else
+    mov                  r7, srcq
+%endif
+    shl                 r6d, (16 - 2)  ; ((wq / 4) - 1) << 16
+    mov                 r6w, hw
+.v_w4_loop0:
+    movd                 m2, [srcq+ssq*0] ; 0
+    movhps               m2, [srcq+ssq*2] ; 0 _ 2
+    movd                 m3, [srcq+ssq*1] ; 1
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+    movhps               m3, [srcq+ssq*0] ; 1 _ 3
+    lea                srcq, [srcq+ssq*1]
+%else
+    movhps               m3, [srcq+ss3q ] ; 1 _ 3
+    lea                srcq, [srcq+ssq*4]
+%endif
+    pshufd               m2, m2, q2020    ; 0 2 0 2
+    pshufd               m3, m3, q2020    ; 1 3 1 3
+    punpckldq            m2, m3           ; 0 1 2 3
+    movd                 m3, [srcq+ssq*0] ; 4
+    movd                 m1, [srcq+ssq*1] ; 5
+    movd                 m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+%else
+    add                srcq, ss3q
+%endif
+    punpckldq            m3, m1           ; 4 5 _ _
+    punpckldq            m1, m0           ; 5 6 _ _
+    palignr              m4, m3, m2, 4    ; 1 2 3 4
+    punpcklbw            m3, m1           ; 45 56
+    punpcklbw            m1, m2, m4       ; 01 12
+    punpckhbw            m2, m4           ; 23 34
+.v_w4_loop:
+    pmaddubsw            m5, m1, subpel0  ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m2, subpel1      ; a1 b1
+    paddw                m5, m2
+    mova                 m2, m3
+    pmaddubsw            m3, subpel2      ; a2 b2
+    paddw                m5, m3
+    movd                 m4, [srcq+ssq*0]
+    punpckldq            m3, m0, m4       ; 6 7 _ _
+    movd                 m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m4, m0           ; 7 8 _ _
+    punpcklbw            m3, m4           ; 67 78
+    pmaddubsw            m4, m3, subpel3  ; a3 b3
+    paddw                m5, m4
+    pmulhrsw             m5, m7
+    packuswb             m5, m5
+    movd       [dstq+dsq*0], m5
+    pshufd               m5, m5, q0101
+    movd       [dstq+dsq*1], m5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    mov                  hw, r6w ; reset vertical loop
+    add                  r4, 4
+    mov                dstq, r4
+%if ARCH_X86_32
+    mov                srcq, srcm
+    add                srcq, 4
+    mov                srcm, srcq
+%else
+    add                  r7, 4
+    mov                srcq, r7
+%endif
+    sub                 r6d, 1<<16 ; horizontal--
+    jg .v_w4_loop0
+    RET
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+    lea                 r6d, [wq - 8] ; horizontal loop
+    mov                  r4, dstq
+    mov                  r7, srcq
+    shl                 r6d, 8 - 3 ; ((wq / 8) - 1) << 8
+    mov                 r6b, hb
+.v_w8_loop0:
+    movq                 m4, [srcq+ssq*0]   ; 0
+    movq                 m5, [srcq+ssq*1]   ; 1
+    lea                srcq, [srcq+ssq*2]
+    movq                 m6, [srcq+ssq*0]   ; 2
+    movq                 m0, [srcq+ssq*1]   ; 3
+    lea                srcq, [srcq+ssq*2]
+    movq                 m1, [srcq+ssq*0]   ; 4
+    movq                 m2, [srcq+ssq*1]   ; 5
+    lea                srcq, [srcq+ssq*2]   ;
+    movq                 m3, [srcq+ssq*0]   ; 6
+    shufpd               m4, m0, 0x0c
+    shufpd               m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w8_loop:
+    movq                m12, [srcq+ssq*1]   ; 8
+    lea                srcq, [srcq+ssq*2]
+    movq                m13, [srcq+ssq*0]   ; 9
+    pmaddubsw           m14, m1, subpel0 ; a0
+    pmaddubsw           m15, m2, subpel0 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, subpel1 ; a1
+    pmaddubsw            m4, subpel1 ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, subpel2 ; a2
+    pmaddubsw            m6, subpel2 ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, subpel3 ; a3
+    pmaddubsw           m13, m6, subpel3 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    packuswb            m14, m15
+    movq       [dstq+dsq*0], xm14
+    movhps     [dstq+dsq*1], xm14
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    movzx                hd, r6b ; reset vertical loop
+    add                  r4, 8
+    add                  r7, 8
+    mov                dstq, r4
+    mov                srcq, r7
+    sub                 r6d, 1<<8 ; horizontal--
+    jg .v_w8_loop0
+    RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+    %assign stack_offset org_stack_offset
+    cmp                  wd, 4
+    jg .hv_w8
+%if ARCH_X86_32
+    and                 mxd, 0x7f
+%else
+    movzx               mxd, mxb
+%endif
+    dec                srcq
+    movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+%if ARCH_X86_32
+    movzx               mxd, ssb
+    shr                 ssd, 16
+    cmp                  hd, 6
+    cmovs               ssd, mxd
+    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+    W32_RESTORE_SSQ
+    lea                  r6, [ssq*3]
+    sub                srcq, r6
+ %define           base_reg  r6
+    mov                  r6, r1 ; use as new base
+ %assign regs_used 2
+    ALLOC_STACK  -mmsize*14
+ %assign regs_used 7
+    mov                 dsq, [rstk+stack_offset+gprsize*2]
+ %define           subpelv0  [rsp+mmsize*0]
+ %define           subpelv1  [rsp+mmsize*1]
+ %define           subpelv2  [rsp+mmsize*2]
+ %define           subpelv3  [rsp+mmsize*3]
+    punpcklqdq           m0, m0
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd               m6, m0, q0000
+    mova           subpelv0, m6
+    pshufd               m6, m0, q1111
+    mova           subpelv1, m6
+    pshufd               m6, m0, q2222
+    mova           subpelv2, m6
+    pshufd               m6, m0, q3333
+    mova           subpelv3, m6
+%else
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+    ALLOC_STACK   mmsize*14, 14
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+ %define           subpelv0  m10
+ %define           subpelv1  m11
+ %define           subpelv2  m12
+ %define           subpelv3  m13
+    punpcklqdq           m0, m0
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    mova                 m8, [base+pw_8192]
+    mova                 m9, [base+pd_512]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+%endif
+    pshufd               m7, m1, q0000
+    cmp                  wd, 4
+    je .hv_w4
+.hv_w2:
+    mova                 m6, [base+subpel_h_shuf4]
+    ;
+    movq                 m2, [srcq+ssq*0]     ; 0
+    movhps               m2, [srcq+ssq*1]     ; 0 _ 1
+    movq                 m0, [srcq+ssq*2]     ; 2
+%if ARCH_X86_32
+ %define           w8192reg  [base+pw_8192]
+ %define            d512reg  [base+pd_512]
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
+    lea                srcq, [srcq+ssq*1]
+%else
+ %define           w8192reg  m8
+ %define            d512reg  m9
+    movhps               m0, [srcq+ss3q ]     ; 2 _ 3
+    lea                srcq, [srcq+ssq*4]
+%endif
+    pshufb               m2, m6 ; 0 ~ 1 ~
+    pshufb               m0, m6 ; 2 ~ 3 ~
+    pmaddubsw            m2, m7 ; subpel_filters
+    pmaddubsw            m0, m7 ; subpel_filters
+    phaddw               m2, m0 ; 0 1 2 3
+    pmulhrsw             m2, w8192reg
+    ;
+    movq                 m3, [srcq+ssq*0]     ; 4
+    movhps               m3, [srcq+ssq*1]     ; 4 _ 5
+    movq                 m0, [srcq+ssq*2]     ; 6
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+%else
+    add                srcq, ss3q
+%endif
+    pshufb               m3, m6 ; 4 ~ 5 ~
+    pshufb               m0, m6 ; 6 ~
+    pmaddubsw            m3, m7 ; subpel_filters
+    pmaddubsw            m0, m7 ; subpel_filters
+    phaddw               m3, m0 ; 4 5 6 _
+    pmulhrsw             m3, w8192reg
+    ;
+    palignr              m4, m3, m2, 4; V        1 2 3 4
+    punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
+    punpckhwd            m2, m4       ; V 23 34    2 3 3 4
+    pshufd               m0, m3, q2121; V          5 6 5 6
+    punpcklwd            m3, m0       ; V 45 56    4 5 5 6
+.hv_w2_loop:
+    pmaddwd              m5, m1, subpelv0; V a0 b0
+    mova                 m1, m2       ; V
+    pmaddwd              m2, subpelv1 ; V a1 b1
+    paddd                m5, m2       ; V
+    mova                 m2, m3       ; V
+    pmaddwd              m3, subpelv2 ; a2 b2
+    paddd                m5, m3       ; V
+    movq                 m4, [srcq+ssq*0] ; V 7
+    movhps               m4, [srcq+ssq*1] ; V 7 8
+    lea                srcq, [srcq+ssq*2] ; V
+    pshufb               m4, m6
+    pmaddubsw            m4, m7
+    phaddw               m4, m4
+    pmulhrsw             m4, w8192reg
+    palignr              m3, m4, m0, 12
+    mova                 m0, m4
+    punpcklwd            m3, m0           ; V 67 78
+    pmaddwd              m4, m3, subpelv3 ; V a3 b3
+    paddd                m5, d512reg
+    paddd                m5, m4
+    psrad                m5, 10
+    packssdw             m5, m5
+    packuswb             m5, m5
+    movd                r4d, m5
+    mov        [dstq+dsq*0], r4w
+    shr                 r4d, 16
+    mov        [dstq+dsq*1], r4w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+%undef w8192reg
+%undef d512reg
+    ;
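+; .hv_w4 computes the 4-wide case in two halves, using the two 16-byte
+; variants of subpel_h_shuf4 (the "lower" and "upper" shuffles below); the
+; per-half vertical state is ping-ponged through stack slots with
+; SAVELINE_W4/RESTORELINE_W4.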
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+    ;
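+; SAVELINE_W4/RESTORELINE_W4 spill and reload one xmm row to the stack slot
+; picked by the hv4_line_%3_%2 constants above.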
+%macro SAVELINE_W4 3
+    mova     [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+    mova     %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
+    ;
+%if ARCH_X86_32
+ %define           w8192reg  [base+pw_8192]
+ %define            d512reg  [base+pd_512]
+%else
+ %define           w8192reg  m8
+ %define            d512reg  m9
+%endif
+    ; lower shuffle 0 1 2 3 4
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
+    movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
+    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+    movhps               m4, [srcq+ssq*0]   ; 2 _ 3 _
+    add                srcq, ssq
+%else
+    movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
+    lea                srcq, [srcq+ssq*4]
+%endif
+    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw            m2, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m2, m0 ;H 0 1 2 3
+    pmulhrsw             m2, w8192reg ;H pw_8192
+    SAVELINE_W4          m2, 2, 0
+    ; upper shuffle 2 3 4 5 6
+    mova                 m6, [base+subpel_h_shuf4+16]
+    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw            m2, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m2, m0 ;H 0 1 2 3
+    pmulhrsw             m2, w8192reg ;H pw_8192
+    ;
+    ; lower shuffle
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
+    movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
+    movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
+    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw            m3, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m3, m0 ;H 4 5 6 _
+    pmulhrsw             m3, w8192reg ;H pw_8192
+    SAVELINE_W4          m3, 3, 0
+    ; upper shuffle
+    mova                 m6, [base+subpel_h_shuf4+16]
+    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw            m3, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m3, m0 ;H 4 5 6 _
+    pmulhrsw             m3, w8192reg ;H pw_8192
+    ;
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    add                srcq, ssq
+%else
+    add                srcq, ss3q
+%endif
+    ; process high half
+    palignr              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    ; process low half
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    palignr              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+.hv_w4_loop:
+    ; process low half
+    pmaddwd              m5, m1, subpelv0 ; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+    ;
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m4, [srcq+ssq*0] ; 7
+    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
+    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw            m4, m7 ;H subpel_filters
+    phaddw               m4, m4 ;H                7 8 7 8
+    pmulhrsw             m4, w8192reg ;H pw_8192
+    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+    paddd                m5, d512reg ; pd_512
+    paddd                m5, m4
+    psrad                m5, 10
+    SAVELINE_W4          m0, 0, 0
+    SAVELINE_W4          m1, 1, 0
+    SAVELINE_W4          m2, 2, 0
+    SAVELINE_W4          m3, 3, 0
+    SAVELINE_W4          m5, 5, 0
+    ; process high half
+    RESTORELINE_W4       m0, 0, 1
+    RESTORELINE_W4       m1, 1, 1
+    RESTORELINE_W4       m2, 2, 1
+    RESTORELINE_W4       m3, 3, 1
+    pmaddwd              m5, m1, subpelv0; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+    ;
+    mova                 m6, [base+subpel_h_shuf4+16]
+    movq                 m4, [srcq+ssq*0] ; 7
+    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
+    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw            m4, m7 ;H subpel_filters
+    phaddw               m4, m4 ;H                7 8 7 8
+    pmulhrsw             m4, w8192reg ;H pw_8192
+    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+    paddd                m5, d512reg ; pd_512
+    paddd                m5, m4
+    psrad                m4, m5, 10
+    ;
+    RESTORELINE_W4       m5, 5, 0
+    packssdw             m5, m4 ; d -> w
+    packuswb             m5, m5 ; w -> b
+    pshuflw              m5, m5, q3120
+    lea                srcq, [srcq+ssq*2]
+    movd       [dstq+dsq*0], m5
+    psrlq                m5, 32
+    movd       [dstq+dsq*1], m5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    RESTORELINE_W4       m0, 0, 0
+    RESTORELINE_W4       m1, 1, 0
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    jg .hv_w4_loop
+    RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+    ;
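+; .hv_w8 covers w >= 8 by filtering 4-pixel column strips: r6d packs
+; (w/4 - 1) into its high word and the row count into its low word, and
+; .hv_w8_loop0 restarts the vertical loop for each strip.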
+.hv_w8:
+    %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
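+; SAVELINE_W8/RESTORELINE_W8: the same stack ping-pong as the W4 variants,
+; with slots picked by the hv8_line_* indices above.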
+%macro SAVELINE_W8 2
+    mova     [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+    mova     %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+    shr                 mxd, 16
+    sub                srcq, 3
+%if ARCH_X86_32
+ %define           base_reg  r1
+ %define           subpelh0  [rsp+mmsize*5]
+ %define           subpelh1  [rsp+mmsize*6]
+ %define           subpelv0  [rsp+mmsize*7]
+ %define           subpelv1  [rsp+mmsize*8]
+ %define           subpelv2  [rsp+mmsize*9]
+ %define           subpelv3  [rsp+mmsize*10]
+ %define             accuv0  [rsp+mmsize*11]
+ %define             accuv1  [rsp+mmsize*12]
+    movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
+    movzx               mxd, ssb ; ss temporarily holds my on x86-32
+    shr                 ssd, 16
+    cmp                  hd, 6
+    cmovs               ssd, mxd ; 4-tap variant for h < 6
+    movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
+    mov                 ssq, ssmp ; reload the real stride
+    ALLOC_STACK  -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define               srcm  [rsp+mmsize*13+gprsize*1]
+ %define                dsm  [rsp+mmsize*13+gprsize*2]
+    mov                  r6, [rstk+stack_offset+gprsize*2]
+    mov                 dsm, r6
+%endif
+    pshufd               m0, m1, q0000
+    pshufd               m1, m1, q1111
+    punpcklbw            m5, m5
+    psraw                m5, 8 ; sign-extend
+    pshufd               m2, m5, q0000
+    pshufd               m3, m5, q1111
+    pshufd               m4, m5, q2222
+    pshufd               m5, m5, q3333
+    mova           subpelh0, m0
+    mova           subpelh1, m1
+    mova           subpelv0, m2
+    mova           subpelv1, m3
+    mova           subpelv2, m4
+    mova           subpelv3, m5
+    lea                  r6, [ssq*3]
+    sub                srcq, r6
+    mov                srcm, srcq
+%else
+    ALLOC_STACK    mmsize*5, 16
+ %define           subpelh0  m10
+ %define           subpelh1  m11
+ %define           subpelv0  m12
+ %define           subpelv1  m13
+ %define           subpelv2  m14
+ %define           subpelv3  m15
+ %define             accuv0  m8
+ %define             accuv1  m9
+    movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
+    pshufd         subpelh0, m0, q0000
+    pshufd         subpelh1, m0, q1111
+    punpcklqdq           m1, m1
+    punpcklbw            m1, m1
+    psraw                m1, 8 ; sign-extend
+    pshufd         subpelv0, m1, q0000
+    pshufd         subpelv1, m1, q1111
+    pshufd         subpelv2, m1, q2222
+    pshufd         subpelv3, m1, q3333
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    mov                  r7, srcq
+%endif
+    lea                 r6d, [wq-4] ; horizontal loop
+    mov                  r4, dstq
+    shl                 r6d, (16 - 2) ; (wq / 4) << 16
+    mov                 r6w, hw
+.hv_w8_loop0:
+    movu                 m4, [srcq+ssq*0] ; 0 = _ _
+    movu                 m5, [srcq+ssq*1] ; 1 = _ _
+    lea                srcq, [srcq+ssq*2]
+    ;
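+; HV_H_W8 runs the 8-tap horizontal filter on one row: three shuffles
+; (subpel_h_shufA/B/C) feed pmaddubsw with the +0/+4 coefficient halves and
+; phaddw folds the partial sums (C0+B4, A0+C4).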
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ %if ARCH_X86_32
+    pshufb               %3, %1, [base+subpel_h_shufB]
+    pshufb               %4, %1, [base+subpel_h_shufC]
+    pshufb               %1,     [base+subpel_h_shufA]
+ %else
+    pshufb               %3, %1, %6  ; subpel_h_shufB
+    pshufb               %4, %1, %7  ; subpel_h_shufC
+    pshufb               %1, %5      ; subpel_h_shufA
+ %endif
+    pmaddubsw            %2, %3, subpelh0 ; subpel +0 C0
+    pmaddubsw            %4, subpelh1; subpel +4 B4
+    pmaddubsw            %3, subpelh1; C4
+    pmaddubsw            %1, subpelh0; A0
+    paddw                %2, %4      ; C0+B4
+    paddw                %1, %3      ; A0+C4
+    phaddw               %1, %2
+%endmacro
+    ;
+%if ARCH_X86_64
+    mova                 m7, [base+subpel_h_shufA]
+    mova                 m8, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+%endif
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+    movu                 m6, [srcq+ssq*0] ; 2 = _ _
+    movu                 m0, [srcq+ssq*1] ; 3 = _ _
+    lea                srcq, [srcq+ssq*2]
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+    ;
+    mova                 m7, [base+pw_8192]
+    pmulhrsw             m4, m7 ; H pw_8192
+    pmulhrsw             m5, m7 ; H pw_8192
+    pmulhrsw             m6, m7 ; H pw_8192
+    pmulhrsw             m0, m7 ; H pw_8192
+    punpcklwd            m1, m4, m5  ; 0 1 ~
+    punpcklwd            m2, m5, m6  ; 1 2 ~
+    punpcklwd            m3, m6, m0  ; 2 3 ~
+    SAVELINE_W8           1, m1
+    SAVELINE_W8           2, m2
+    SAVELINE_W8           3, m3
+    ;
+    mova                 m7, [base+subpel_h_shufA]
+    movu                 m4, [srcq+ssq*0]       ; 4 = _ _
+    movu                 m5, [srcq+ssq*1]       ; 5 = _ _
+    lea                srcq, [srcq+ssq*2]
+    movu                 m6, [srcq+ssq*0]       ; 6 = _ _
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+    mova                 m7, [base+pw_8192]
+    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
+    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
+    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
+    punpcklwd            m4, m0, m1  ; 3 4 ~
+    punpcklwd            m5, m1, m2  ; 4 5 ~
+    punpcklwd            m6, m2, m3  ; 5 6 ~
+    ;
+    SAVELINE_W8           6, m3
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+.hv_w8_loop:
+    ; m8 accu for V a
+    ; m9 accu for V b
+    SAVELINE_W8           1, m3
+    SAVELINE_W8           2, m4
+    SAVELINE_W8           3, m5
+    SAVELINE_W8           4, m6
+%if ARCH_X86_32
+    pmaddwd              m0, m1, subpelv0 ; a0
+    pmaddwd              m7, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd                m0, m3
+    paddd                m7, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd                m0, m5
+    paddd                m7, m6
+    mova                 m5, [base+pd_512]
+    paddd                m0, m5 ;   pd_512
+    paddd                m7, m5 ;   pd_512
+    mova             accuv0, m0
+    mova             accuv1, m7
+%else
+    pmaddwd              m8, m1, subpelv0 ; a0
+    pmaddwd              m9, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    mova                 m7, [base+pd_512]
+    paddd                m8, m7 ;   pd_512
+    paddd                m9, m7 ;   pd_512
+    mova                 m7, [base+subpel_h_shufB]
+    mova                 m6, [base+subpel_h_shufC]
+    mova                 m5, [base+subpel_h_shufA]
+%endif
+    movu                 m0, [srcq+ssq*1] ; 7
+    movu                 m4, [srcq+ssq*2] ; 8
+    lea                srcq, [srcq+ssq*2]
+    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
+    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
+    mova                 m5, [base+pw_8192]
+    pmulhrsw             m0, m5 ; H pw_8192
+    pmulhrsw             m4, m5 ; H pw_8192
+    RESTORELINE_W8        6, m6
+    punpcklwd            m5, m6, m0  ; 6 7  ~
+    punpcklwd            m6, m0, m4  ; 7 8 ~
+    pmaddwd              m1, m5, subpelv3 ; a3
+    paddd                m2, m1, accuv0
+    pmaddwd              m1, m6, subpelv3 ; b3
+    paddd                m1, m1, accuv1 ; H + V
+    psrad                m2, 10
+    psrad                m1, 10
+    packssdw             m2, m1  ; d -> w
+    packuswb             m2, m1 ; w -> b
+    movd       [dstq+dsq*0], m2
+    psrlq                m2, 32
+%if ARCH_X86_32
+    add                dstq, dsm
+    movd       [dstq+dsq*0], m2
+    add                dstq, dsm
+%else
+    movd       [dstq+dsq*1], m2
+    lea                dstq, [dstq+dsq*2]
+%endif
+    sub                  hd, 2
+    jle .hv_w8_outer
+    SAVELINE_W8           6, m4
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+    RESTORELINE_W8        4, m4
+    jmp .hv_w8_loop
+.hv_w8_outer:
+    movzx                hd, r6w ; reset vertical loop
+    add                  r4, 4
+    mov                dstq, r4
+%if ARCH_X86_32
+    mov                srcq, srcm
+    add                srcq, 4
+    mov                srcm, srcq
+%else
+    add                  r7, 4
+    mov                srcq, r7
+%endif
+    sub                 r6d, 1<<16 ; horizontal--
+    jg .hv_w8_loop0
+    RET
+
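+; PSHUFB_SUBPEL_H_4 and its _4a/_4b wrappers stand in for the subpel_h_shuf4
+; pshufb on pre-SSSE3 targets: the source is merged with a byte-shifted,
+; dword-swapped copy of itself under a dword mask that is built once when
+; reset_mask == 1.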
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+  %if %5 == 1
+    pcmpeqd              %2, %2
+    psrlq                %2, 32
+  %endif
+    psrldq               %3, %1, 1
+    pshufd               %3, %3, q2301
+    pand                 %1, %2
+    pandn                %4, %2, %3
+    por                  %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+    psrlq                %1, %2, 16
+ %elifnidn %1, %2
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
+%endmacro
+
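+; palignr fallback for SSE2: composes src2 >> shift with src1 << (16-shift)
+; via por; if no tmp register is given, the register numbered one above dst
+; is used as scratch (regnumof).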
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+    palignr              %1, %2, %3, %4
+ %else
+  %if %0 == 4
+   %assign %%i regnumof%+%1 + 1
+   %define %%tmp m %+ %%i
+  %else
+   %define %%tmp %5
+  %endif
+    psrldq               %1, %3, %4
+    pslldq            %%tmp, %2, 16-%4
+    por                  %1, %%tmp
+ %endif
+%endmacro
+
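+; phaddw fallback for SSE2: pmaddwd against pw_1 produces the pairwise word
+; sums as dwords and packssdw packs them back; load_pw_1 says whether pw_1
+; still needs to be fetched into the tmp register.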
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+    phaddw               %1, %2
+ %else
+  %ifnidn %1, %2
+   %if %4 == 1
+    mova                 %3, [pw_1]
+   %endif
+    pmaddwd              %1, %3
+    pmaddwd              %2, %3
+    packssdw             %1, %2
+  %else
+   %if %4 == 1
+    pmaddwd              %1, [pw_1]
+   %else
+    pmaddwd              %1, %3
+   %endif
+    packssdw             %1, %1
+  %endif
+ %endif
+%endmacro
+
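+; pmulhrsw fallback for power-of-two scales: add the rounding constant held
+; in src2, then arithmetic-shift. PMULHRSW_8192 uses shift 2, matching
+; pmulhrsw against pw_8192 (a rounded >> 2) when src2 holds pw_2.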
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+    pmulhrsw             %1, %2, %3
+ %else
+    paddw                %1, %2, %3
+    psraw                %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+    PMULHRSW_POW2        %1, %2, %3, 2
+%endmacro
+
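+; Loads four overlapping dwords from src+0..3 and packs them into one
+; register - the SSE2 substitute for one unaligned load plus a subpel
+; pshufb.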
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2]
+   movd                  %1, [%2+0]
+   movd                  %3, [%2+1]
+   movd                  %4, [%2+2]
+   movd                  %5, [%2+3]
+   punpckldq             %1, %3
+   punpckldq             %4, %5
+   punpcklqdq            %1, %4
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+    movu                m%1, [%2]
+    pshufb               m2, m%1, m11 ; subpel_h_shufB
+    pshufb               m3, m%1, m9  ; subpel_h_shufC
+    pshufb              m%1, m10      ; subpel_h_shufA
+ %else
+  %if ARCH_X86_64
+    SWAP                m12, m5
+    SWAP                m13, m6
+    SWAP                m14, m7
+   %define %%mx0 m%+%%i
+   %define %%mx1 m%+%%j
+   %assign %%i 0
+   %rep 12
+    movd              %%mx0, [%2+%%i]
+    %assign %%i %%i+1
+   %endrep
+   %assign %%i 0
+   %rep 6
+    %assign %%j %%i+1
+    punpckldq         %%mx0, %%mx1
+    %assign %%i %%i+2
+   %endrep
+   %assign %%i 0
+   %rep 3
+    %assign %%j %%i+2
+    punpcklqdq        %%mx0, %%mx1
+    %assign %%i %%i+4
+   %endrep
+    SWAP                m%1, m0
+    SWAP                 m2, m4
+    SWAP                 m3, m8
+    SWAP                 m5, m12
+    SWAP                 m6, m13
+    SWAP                 m7, m14
+  %else
+    PREP_8TAP_H_LOAD4    m0, %2+0, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, m4, m7
+    SWAP                m%1, m0
+  %endif
+ %endif
+%endmacro
+
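+; Horizontal prep filter for one 8-pixel row: pmaddubsw pairs against the
+; +0 and +4 coefficient halves (A0/B0 and B4/C4), a horizontal add, and the
+; rounded scale down to the 16-bit prep intermediate.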
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+    PREP_8TAP_H_LOAD     %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP                 m8, m1
+    SWAP                 m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+    mova                 m4, m2
+    PMADDUBSW            m4, m5, m1, m7, 1  ; subpel +0 B0
+    PMADDUBSW            m2, m6, m1, m7, 0  ; subpel +4 B4
+    PMADDUBSW            m3, m6, m1, m7, 0  ; subpel +4 C4
+    PMADDUBSW            mX, m5, m1, m7, 0  ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP                 m1, m8
+    SWAP                 m7, m9
+ %endif
+    paddw                m3, m4
+    paddw               m%1, m2
+    PHADDW              m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+    PMULHRSW_8192       m%1, m%1, m7
+ %else
+    PMULHRSW_8192       m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
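+; PREP_8TAP_HV_LOAD/PREP_8TAP_HV: the same horizontal filter for the h+v
+; path, except the shuffles come from the shufA/B/C defines (or the LOAD4
+; fallback) and the rounding multiply is left to the caller, since the
+; result feeds the vertical stage first.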
+%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+    movu                 %1, [%2]
+    pshufb               m2, %1, shufB
+    pshufb               m3, %1, shufC
+    pshufb               %1, shufA
+ %else
+    PREP_8TAP_H_LOAD4    %1, %2+0, m1, %3, %4
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, %3, %4
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, %3, %4
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+    PREP_8TAP_HV_LOAD %{1:4}
+    mova                 m1, m2
+    PMADDUBSW            m1, subpelh0, %3, %4, 1 ; subpel +0 C0
+    PMADDUBSW            m3, subpelh1, %3, %4, 0 ; subpel +4 B4
+    PMADDUBSW            m2, subpelh1, %3, %4, 0 ; C4
+    PMADDUBSW            %1, subpelh0, %3, %4, 0 ; A0
+    paddw                m1, m3           ; C0+B4
+    paddw                %1, m2           ; A0+C4
+    PHADDW               %1, m1, %3, 1
+%endmacro
+
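+; Entry-point stub: records the h/v filter types in t0d/t1d and tail-jumps
+; into the shared prep_8tap body; the last stub emitted (sharp_smooth)
+; omits the jump and falls through directly.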
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+    mov                 t0d, FILTER_%2
+    mov                 t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
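+; The prep_8tap body is wrapped in a macro so it can be instantiated once
+; per target instruction set; the cpuflag(ssse3) branches inside pick
+; between native pshufb/phaddw/pmulhrsw and the SSE2 emulations above.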
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+ %define W32_RESTORE_SSQ mov strideq, stridem
+%else
+ %define base_reg r7
+ %define base 0
+ %define W32_RESTORE_SSQ
+%endif
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    movsxd               wq, wm
+    movifnidn          srcd, srcm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    LEA            base_reg, prep_ssse3
+    tzcnt                wd, wd
+    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+    add                  wq, base_reg
+    movifnidn       strided, stridem
+    lea                  r6, [strideq*3]
+    %assign stack_offset org_stack_offset
+%if WIN64
+    pop                  r8
+    pop                  r7
+%endif
+    jmp                  wq
+.h:
+    LEA            base_reg, prep%+SUFFIX
+    test                myd, 0xf00
+    jnz .hv
+%if cpuflag(ssse3)
+    WIN64_SPILL_XMM      12
+%else
+    WIN64_SPILL_XMM      16
+%endif
+    cmp                  wd, 4
+    je .h_w4
+    tzcnt                wd, wd
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+    mova                m10, [base+subpel_h_shufA]
+    mova                m11, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+ %else
+  %define m10 [base+subpel_h_shufA]
+  %define m11 [base+subpel_h_shufB]
+  %define m9  [base+subpel_h_shufC]
+ %endif
+%endif
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+    movd                 m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
+    pshufd               m5, m5, q0000
+    movd                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
+    pshufd               m6, m6, q0000
+%if cpuflag(ssse3)
+    mova                 m7, [base+pw_8192]
+%else
+    punpcklbw            m5, m5
+    punpcklbw            m6, m6
+    psraw                m5, 8
+    psraw                m6, 8
+ %if ARCH_X86_64
+    mova                 m7, [pw_2]
+    mova                m15, [pw_1]
+ %else
+  %define m15 m4
+ %endif
+%endif
+    add                  wq, base_reg
+    jmp                  wq
+.h_w4:
+%if ARCH_X86_32
+    and                 mxd, 0x7f
+%else
+    movzx               mxd, mxb
+%endif
+    dec                srcq
+    movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+    pshufd               m4, m4, q0000
+%if cpuflag(ssse3)
+    mova                 m6, [base+pw_8192]
+    mova                 m5, [base+subpel_h_shufA]
+%else
+    mova                 m6, [base+pw_2]
+ %if ARCH_X86_64
+    mova                m14, [pw_1]
+ %else
+  %define m14 m7
+ %endif
+    punpcklbw            m4, m4
+    psraw                m4, 8
+%endif
+    W32_RESTORE_SSQ
+%if ARCH_X86_64
+    lea            stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+%if cpuflag(ssse3)
+    movq                 m0, [srcq+strideq*0] ; 0
+    movq                 m1, [srcq+strideq*1] ; 1
+ %if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movq                 m2, [srcq+strideq*0] ; 2
+    movq                 m3, [srcq+strideq*1] ; 3
+    lea                srcq, [srcq+strideq*2]
+ %else
+    movq                 m2, [srcq+strideq*2] ; 2
+    movq                 m3, [srcq+stride3q ] ; 3
+    lea                srcq, [srcq+strideq*4]
+ %endif
+    pshufb               m0, m5
+    pshufb               m1, m5
+    pshufb               m2, m5
+    pshufb               m3, m5
+%else
+ %if ARCH_X86_64
+    movd                 m0, [srcq+strideq*0+0]
+    movd                m12, [srcq+strideq*0+1]
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m5, [srcq+strideq*1+1]
+    movd                 m2, [srcq+strideq*2+0]
+    movd                m13, [srcq+strideq*2+1]
+    movd                 m3, [srcq+stride3q +0]
+    movd                 m7, [srcq+stride3q +1]
+    punpckldq            m0, m12
+    punpckldq            m1, m5
+    punpckldq            m2, m13
+    punpckldq            m3, m7
+    movd                m12, [srcq+strideq*0+2]
+    movd                 m8, [srcq+strideq*0+3]
+    movd                 m5, [srcq+strideq*1+2]
+    movd                 m9, [srcq+strideq*1+3]
+    movd                m13, [srcq+strideq*2+2]
+    movd                m10, [srcq+strideq*2+3]
+    movd                 m7, [srcq+stride3q +2]
+    movd                m11, [srcq+stride3q +3]
+    lea                srcq, [srcq+strideq*4]
+    punpckldq           m12, m8
+    punpckldq            m5, m9
+    punpckldq           m13, m10
+    punpckldq            m7, m11
+    punpcklqdq           m0, m12 ; 0
+    punpcklqdq           m1, m5  ; 1
+    punpcklqdq           m2, m13 ; 2
+    punpcklqdq           m3, m7  ; 3
+ %else
+    movd                 m0, [srcq+strideq*0+0]
+    movd                 m1, [srcq+strideq*0+1]
+    movd                 m2, [srcq+strideq*0+2]
+    movd                 m3, [srcq+strideq*0+3]
+    punpckldq            m0, m1
+    punpckldq            m2, m3
+    punpcklqdq           m0, m2 ; 0
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m2, [srcq+strideq*1+1]
+    movd                 m3, [srcq+strideq*1+2]
+    movd                 m7, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m1, m2
+    punpckldq            m3, m7
+    punpcklqdq           m1, m3 ; 1
+    movd                 m2, [srcq+strideq*0+0]
+    movd                 m3, [srcq+strideq*0+1]
+    movd                 m7, [srcq+strideq*0+2]
+    movd                 m5, [srcq+strideq*0+3]
+    punpckldq            m2, m3
+    punpckldq            m7, m5
+    punpcklqdq           m2, m7 ; 2
+    movd                 m3, [srcq+strideq*1+0]
+    movd                 m7, [srcq+strideq*1+1]
+    punpckldq            m3, m7
+    movd                 m7, [srcq+strideq*1+2]
+    movd                 m5, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m7, m5
+    punpcklqdq           m3, m7 ; 3
+ %endif
+%endif
+    PMADDUBSW            m0, m4, m5, m7, 1 ; subpel_filters + 2
+    PMADDUBSW            m1, m4, m5, m7, 0
+    PMADDUBSW            m2, m4, m5, m7, 0
+    PMADDUBSW            m3, m4, m5, m7, 0
+    PHADDW               m0, m1, m14, ARCH_X86_32
+    PHADDW               m2, m3, m14, 0
+    PMULHRSW_8192        m0, m0, m6
+    PMULHRSW_8192        m2, m2, m6
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+    ;
+.h_w8:
+%if ARCH_X86_32
+    mov                  r3, r2
+ %define           base_reg  r3
+    W32_RESTORE_SSQ
+%endif
+.h_w8_loop:
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+strideq*0
+    PREP_8TAP_H           1, srcq+strideq*1
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    lea                srcq, [srcq+strideq*2]
+    add                tmpq, 32
+    sub                  hd, 2
+%else
+    PREP_8TAP_H           0, srcq
+    mova             [tmpq], m0
+    add                srcq, strideq
+    add                tmpq, 16
+    dec                  hd
+%endif
+    jg .h_w8_loop
+    RET
+.h_w16:
+    mov                  r6, -16*1
+    jmp .h_start
+.h_w32:
+    mov                  r6, -16*2
+    jmp .h_start
+.h_w64:
+    mov                  r6, -16*4
+    jmp .h_start
+.h_w128:
+    mov                  r6, -16*8
+.h_start:
+%if ARCH_X86_32
+    mov                  r3, r2
+ %define           base_reg  r3
+%endif
+    sub                srcq, r6
+    mov                  r5, r6
+    W32_RESTORE_SSQ
+.h_loop:
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+r6+8*0
+    PREP_8TAP_H           1, srcq+r6+8*1
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    add                tmpq, 32
+    add                  r6, 16
+%else
+    PREP_8TAP_H           0, srcq+r6
+    mova             [tmpq], m0
+    add                tmpq, 16
+    add                  r6, 8
+%endif
+    jl .h_loop
+    add                srcq, strideq
+    mov                  r6, r5
+    dec                  hd
+    jg .h_loop
+    RET
+%if ARCH_X86_32
+ %define            base_reg r2
+%endif
+    ;
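+; Vertical-only path: my selects the vertical filter (the 4-tap variant for
+; h < 6 via cmovs), and its taps are broadcast into subpel0-3.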
+.v:
+    LEA            base_reg, prep%+SUFFIX
+%if ARCH_X86_32
+    mov                 mxd, myd
+    and                 mxd, 0x7f
+%else
+ %assign stack_offset org_stack_offset
+    WIN64_SPILL_XMM      16
+    movzx               mxd, myb
+%endif
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    lea                 myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+    mova                 m2, [base+pw_512]
+    psrlw                m2, m2, 1 ; 0x0100
+    mova                 m7, [base+pw_8192]
+%endif
+%if ARCH_X86_32
+ %define            subpel0  [rsp+mmsize*0]
+ %define            subpel1  [rsp+mmsize*1]
+ %define            subpel2  [rsp+mmsize*2]
+ %define            subpel3  [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+    ALLOC_STACK   -mmsize*4
+ %else
+    ALLOC_STACK   -mmsize*5
+ %endif
+%assign regs_used 7
+    movd                 m0, [myq+0]
+    PSHUFB_0X1X          m0, m2
+    mova            subpel0, m0
+    movd                 m0, [myq+2]
+    PSHUFB_0X1X          m0, m2
+    mova            subpel1, m0
+    movd                 m0, [myq+4]
+    PSHUFB_0X1X          m0, m2
+    mova            subpel2, m0
+    movd                 m0, [myq+6]
+    PSHUFB_0X1X          m0, m2
+    mova            subpel3, m0
+ %if notcpuflag(ssse3)
+    mov                  r6, base_reg
+  %define base_reg r6
+ %endif
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    lea             strideq, [strideq*3]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    mov                srcq, [rstk+stack_offset+gprsize*2]
+%else
+ %define            subpel0  m8
+ %define            subpel1  m9
+ %define            subpel2  m10
+ %define            subpel3  m11
+    movd            subpel0, [myq+0]
+    PSHUFB_0X1X     subpel0, m2
+    movd            subpel1, [myq+2]
+    PSHUFB_0X1X     subpel1, m2
+    movd            subpel2, [myq+4]
+    PSHUFB_0X1X     subpel2, m2
+    movd            subpel3, [myq+6]
+    PSHUFB_0X1X     subpel3, m2
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    cmp                  wd, 8
+    jns .v_w8
+%endif
+.v_w4:
+%if notcpuflag(ssse3)
+    pxor                 m6, m6
+ %if ARCH_X86_64
+    mova                 m7, [base+pw_2]
+ %endif
+%endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+  %define srcm [esp+stack_size+gprsize*1]
+  %define tmpm [esp+stack_size+gprsize*2]
+ %endif
+    mov                tmpm, tmpq
+    mov                srcm, srcq
+    lea                 r5d, [wq - 4] ; horizontal loop
+    shl                 r5d, (16 - 2)  ; (wq / 4) << 16
+    mov                 r5w, hw
+.v_w4_loop0:
+%endif
+    movd                 m2, [srcq+strideq*0] ; 0
+    movhps               m2, [srcq+strideq*2] ; 0 _ 2
+    movd                 m3, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movhps               m3, [srcq+strideq*1] ; 1 _ 3
+    lea                srcq, [srcq+strideq*2]
+%else
+    movhps               m3, [srcq+stride3q ] ; 1 _ 3
+    lea                srcq, [srcq+strideq*4]
+%endif
+    pshufd               m2, m2, q2020    ; 0 2 0 2
+    pshufd               m3, m3, q2020    ; 1 3 1 3
+    punpckldq            m2, m3           ; 0 1 2 3
+    movd                 m3, [srcq+strideq*0] ; 4
+    movd                 m1, [srcq+strideq*1] ; 5
+    movd                 m0, [srcq+strideq*2] ; 6
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+%else
+    add                srcq, stride3q
+%endif
+    punpckldq            m3, m1           ; 4 5 _ _
+    punpckldq            m1, m0           ; 5 6 _ _
+    PALIGNR              m4, m3, m2, 4    ; 1 2 3 4
+    punpcklbw            m3, m1           ; 45 56
+    punpcklbw            m1, m2, m4       ; 01 12
+    punpckhbw            m2, m4           ; 23 34
+.v_w4_loop:
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel0
+ %define subpel0 m7
+%endif
+    mova                 m5, m1
+    PMADDUBSW            m5, subpel0, m6, m4, 0  ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel1
+ %define subpel1 m7
+%endif
+    mova                 m1, m2
+    PMADDUBSW            m2, subpel1, m6, m4, 0  ; a1 b1
+    paddw                m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel2
+ %define subpel2 m7
+%endif
+    mova                 m2, m3
+    PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
+    paddw                m5, m3
+    movd                 m4, [srcq+strideq*0]
+    punpckldq            m3, m0, m4       ; 6 7 _ _
+    movd                 m0, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m4, m0           ; 7 8 _ _
+    punpcklbw            m3, m4           ; 67 78
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m12, m0
+ %else
+    mova     [esp+mmsize*4], m0
+    mova                 m7, subpel3
+  %define subpel3 m7
+ %endif
+%endif
+    mova                 m4, m3
+    PMADDUBSW            m4, subpel3, m6, m0, 0  ; a3 b3
+    paddw                m5, m4
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+    SWAP                 m0, m12
+ %endif
+    PMULHRSW_8192        m5, m5, m7
+%else
+    mova                 m0, [esp+mmsize*4]
+    PMULHRSW_8192        m5, m5, [base+pw_2]
+%endif
+    movq        [tmpq+wq*0], m5
+    movhps      [tmpq+wq*2], m5
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w4_loop
+%if ARCH_X86_32
+    mov                  hw, r5w ; reset vertical loop
+    mov                tmpq, tmpm
+    mov                srcq, srcm
+    add                tmpq, 8
+    add                srcq, 4
+    mov                tmpm, tmpq
+    mov                srcm, srcq
+    sub                 r5d, 1<<16 ; horizontal--
+    jg .v_w4_loop0
+%endif
+    RET
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ %define base_reg r2
+%endif
+    ;
+%if ARCH_X86_64
+.v_w8:
+    lea                 r5d, [wq - 8] ; horizontal loop
+    mov                  r8, tmpq
+    mov                  r6, srcq
+    shl                 r5d, 8 - 3; (wq / 8) << 8
+    mov                 r5b, hb
+.v_w8_loop0:
+    movq                 m4, [srcq+strideq*0]
+    movq                 m5, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                 m6, [srcq+strideq*0]
+    movq                 m0, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                 m1, [srcq+strideq*0]
+    movq                 m2, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                 m3, [srcq+strideq*0]
+    shufpd               m4, m0, 0x0c
+    shufpd               m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w8_loop:
+%if cpuflag(ssse3)
+    movq                m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                m13, [srcq+strideq*0]
+    pmaddubsw           m14, m1, subpel0 ; a0
+    pmaddubsw           m15, m2, subpel0 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, subpel1 ; a1
+    pmaddubsw            m4, subpel1 ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, subpel2 ; a2
+    pmaddubsw            m6, subpel2 ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, subpel3 ; a3
+    pmaddubsw           m13, m6, subpel3 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    movu        [tmpq+wq*0], m14
+    movu        [tmpq+wq*2], m15
+%else
+    mova                m14, m1
+    PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
+    mova                 m1, m3
+    PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
+    paddw               m14, m3
+    mova                 m3, m5
+    PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
+    paddw               m14, m5
+    movq                m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                m13, [srcq+strideq*0]
+    shufpd              m15, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m15, m0  ; 67
+    punpckhbw           m15, m0       ; 78
+    mova                m13, m5
+    PMADDUBSW           m13, subpel3, m7, m12, 0 ; a3
+    paddw               m14, m13
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*0], m14
+    mova                m14, m2
+    PMADDUBSW           m14, subpel0, m7, m12, 0 ; b0
+    mova                 m2, m4
+    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
+    paddw               m14, m4
+    mova                 m4, m6
+    PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
+    paddw               m14, m6
+    mova                 m6, m15
+    PMADDUBSW           m15, subpel3, m7, m12, 0 ; b3
+    paddw               m14, m15
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*2], m14
+%endif
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w8_loop
+    movzx                hd, r5b ; reset vertical loop
+    add                  r8, 16
+    add                  r6, 8
+    mov                tmpq, r8
+    mov                srcq, r6
+    sub                 r5d, 1<<8 ; horizontal--
+    jg .v_w8_loop0
+    RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+    ;
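+; Combined h+v path: w <= 4 pairs the 4-tap centre of the horizontal filter
+; (subpel_filters+2) with the full vertical filter; wider blocks branch to
+; .hv_w8.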
+.hv:
+    %assign stack_offset org_stack_offset
+    cmp                  wd, 4
+    jg .hv_w8
+    and                 mxd, 0x7f
+    movd                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if ARCH_X86_32
+    mov                 mxd, myd
+    shr                 myd, 16
+    and                 mxd, 0x7f
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+    mov                  r5, r2; use as new base
+ %define           base_reg  r5
+ %assign regs_used 2
+    ALLOC_STACK  -mmsize*14
+ %assign regs_used 7
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    lea             strideq, [strideq*3 + 1]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    mov                srcq, [rstk+stack_offset+gprsize*2]
+ %define           subpelv0  [rsp+mmsize*0]
+ %define           subpelv1  [rsp+mmsize*1]
+ %define           subpelv2  [rsp+mmsize*2]
+ %define           subpelv3  [rsp+mmsize*3]
+    punpcklbw            m0, m0
+    psraw                m0, 8
+    pshufd               m6, m0, q0000
+    mova           subpelv0, m6
+    pshufd               m6, m0, q1111
+    mova           subpelv1, m6
+    pshufd               m6, m0, q2222
+    mova           subpelv2, m6
+    pshufd               m6, m0, q3333
+    mova           subpelv3, m6
+%else
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+    ALLOC_STACK   mmsize*14, 14
+ %else
+    ALLOC_STACK   mmsize*14, 16
+ %endif
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    dec                srcq
+ %define           subpelv0  m10
+ %define           subpelv1  m11
+ %define           subpelv2  m12
+ %define           subpelv3  m13
+    punpcklbw            m0, m0
+    psraw                m0, 8
+ %if cpuflag(ssse3)
+    mova                 m8, [base+pw_8192]
+ %else
+    mova                 m8, [base+pw_2]
+ %endif
+    mova                 m9, [base+pd_32]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+%endif
+    pshufd               m7, m1, q0000
+%if notcpuflag(ssse3)
+    punpcklbw            m7, m7
+    psraw                m7, 8
+%endif
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%if ARCH_X86_32
+ %if cpuflag(ssse3)
+  %define           w8192reg  [base+pw_8192]
+ %else
+  %define           w8192reg  [base+pw_2]
+ %endif
+ %define             d32reg  [base+pd_32]
+%else
+ %define           w8192reg  m8
+ %define             d32reg  m9
+%endif
+    ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+    mova                m15, [pw_1]
+ %else
+  %define               m15 m1
+ %endif
+%endif
+    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
+    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
+    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+    movhps               m4, [srcq+strideq*0]   ; 2 _ 3 _
+    add                srcq, strideq
+%else
+    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
+    lea                srcq, [srcq+strideq*4]
+%endif
+    PSHUFB_SUBPEL_H_4a   m2, m5, m6, m1, m3, 1    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
+    SAVELINE_W4          m2, 2, 0
+    ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4+16]
+%endif
+    PSHUFB_SUBPEL_H_4b   m2, m5, m6, m1, m3, 0    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m2
+ %else
+    mova     [esp+mmsize*4], m2
+ %endif
+%endif
+    ; lower shuffle
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4]
+%endif
+    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
+    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
+    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
+    PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 _
+    PMULHRSW_8192        m3, m3, w8192reg
+    SAVELINE_W4          m3, 3, 0
+    ; upper shuffle
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4+16]
+%endif
+    PSHUFB_SUBPEL_H_4b   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 _
+    PMULHRSW_8192        m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m2, m14
+ %else
+    mova                 m2, [esp+mmsize*4]
+ %endif
+%endif
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+%else
+    add                srcq, stride3q
+%endif
+    ; process high half
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    ; process low half
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+.hv_w4_loop:
+    ; process low half
+    pmaddwd              m5, m1, subpelv0 ; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova     [esp+mmsize*4], m5
+  %define m15 m3
+ %endif
+%endif
+    ;
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4]
+%endif
+    movq                 m4, [srcq+strideq*0] ; 7
+    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
+    PSHUFB_SUBPEL_H_4a   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+mmsize*4]
+ %endif
+%endif
+    paddd                m5, d32reg ; pd_32
+    paddd                m5, m4
+    psrad                m5, 6
+    SAVELINE_W4          m0, 0, 0
+    SAVELINE_W4          m1, 1, 0
+    SAVELINE_W4          m2, 2, 0
+    SAVELINE_W4          m3, 3, 0
+    SAVELINE_W4          m5, 5, 0
+    ; process high half
+    RESTORELINE_W4       m0, 0, 1
+    RESTORELINE_W4       m1, 1, 1
+    RESTORELINE_W4       m2, 2, 1
+    RESTORELINE_W4       m3, 3, 1
+    pmaddwd              m5, m1, subpelv0; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova         [esp+0xA0], m5
+ %endif
+%endif
+    ;
+%if cpuflag(ssse3)
+    mova                 m6, [base+subpel_h_shuf4+16]
+%endif
+    movq                 m4, [srcq+strideq*0] ; 7
+    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
+    PSHUFB_SUBPEL_H_4b   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+0xA0]
+ %endif
+%endif
+    paddd                m5, d32reg ; pd_32
+    paddd                m5, m4
+    psrad                m4, m5, 6
+    ;
+    RESTORELINE_W4       m5, 5, 0
+    packssdw             m5, m4
+    pshufd               m5, m5, q3120
+    movu             [tmpq], m5
+    lea                srcq, [srcq+strideq*2]
+    add                tmpq, 16
+    sub                  hd, 2
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    RESTORELINE_W4       m0, 0, 0
+    RESTORELINE_W4       m1, 1, 0
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    jg .hv_w4_loop
+    RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+    ;
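+; Like put's .hv_w8, but the vertical accumulators round with pd_32 and a
+; 6-bit shift so the output stays a 16-bit prep intermediate rather than
+; being packed back to bytes.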
+.hv_w8:
+    %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+    shr                 mxd, 16
+%if ARCH_X86_32
+ %define           base_reg  r2
+ %define           subpelh0  [rsp+mmsize*5]
+ %define           subpelh1  [rsp+mmsize*6]
+ %define           subpelv0  [rsp+mmsize*7]
+ %define           subpelv1  [rsp+mmsize*8]
+ %define           subpelv2  [rsp+mmsize*9]
+ %define           subpelv3  [rsp+mmsize*10]
+ %define             accuv0  [rsp+mmsize*11]
+ %define             accuv1  [rsp+mmsize*12]
+    movq                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+    mov                 mxd, myd
+    shr                 myd, 16
+    and                 mxd, 0x7f
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+    ALLOC_STACK  -mmsize*13
+ %if STACK_ALIGNMENT < mmsize
+    mov                rstk, r2m
+  %define               tmpm  [rsp+mmsize*13+gprsize*1]
+  %define               srcm  [rsp+mmsize*13+gprsize*2]
+  %define            stridem  [rsp+mmsize*13+gprsize*3]
+    mov             stridem, rstk
+ %endif
+    mov                  r6, r2
+ %define base_reg r6
+    pshufd               m0, m1, q0000
+    pshufd               m1, m1, q1111
+    punpcklbw            m5, m5
+ %if notcpuflag(ssse3)
+    punpcklbw            m0, m0
+    punpcklbw            m1, m1
+ %endif
+    psraw                m5, 8
+ %if notcpuflag(ssse3)
+    psraw                m0, 8
+    psraw                m1, 8
+ %endif
+    pshufd               m2, m5, q0000
+    pshufd               m3, m5, q1111
+    pshufd               m4, m5, q2222
+    pshufd               m5, m5, q3333
+    mova           subpelh0, m0
+    mova           subpelh1, m1
+    mova           subpelv0, m2
+    mova           subpelv1, m3
+    mova           subpelv2, m4
+    mova           subpelv3, m5
+    W32_RESTORE_SSQ
+    lea             strided, [strided*3]
+    sub                srcd, strided ; src -= stride*3
+    sub                srcd, 3 ; 8-tap horizontal offset
+    mov                srcm, srcd
+    W32_RESTORE_SSQ ; strideq was clobbered by the *3 above
+%else
+    ALLOC_STACK    mmsize*5, 16
+ %define           subpelh0  m10
+ %define           subpelh1  m11
+ %define           subpelv0  m12
+ %define           subpelv1  m13
+ %define           subpelv2  m14
+ %define           subpelv3  m15
+ %define             accuv0  m8
+ %define             accuv1  m9
+    movq                 m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+    pshufd         subpelh0, m0, q0000
+    pshufd         subpelh1, m0, q1111
+    punpcklbw            m1, m1
+ %if notcpuflag(ssse3)
+    punpcklbw      subpelh0, subpelh0
+    punpcklbw      subpelh1, subpelh1
+ %endif
+    psraw                m1, 8
+ %if notcpuflag(ssse3)
+    psraw          subpelh0, 8
+    psraw          subpelh1, 8
+ %endif
+    pshufd         subpelv0, m1, q0000
+    pshufd         subpelv1, m1, q1111
+    pshufd         subpelv2, m1, q2222
+    pshufd         subpelv3, m1, q3333
+ %if notcpuflag(ssse3)
+    mova                 m7, [base+pw_2]
+ %endif
+    lea            stride3q, [strideq*3]
+    sub                srcq, 3
+    sub                srcq, stride3q
+    mov                  r6, srcq
+%endif
+    lea                 r5d, [wq-4] ; horizontal loop
+%if ARCH_X86_64
+    mov                  r8, tmpq
+%else
+    mov                tmpm, tmpq
+%endif
+    shl                 r5d, (16 - 2) ; (wq / 4) << 16
+    mov                 r5w, hw
+.hv_w8_loop0:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+    mova                 m7, [base+subpel_h_shufA]
+    mova                 m8, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+  %define shufA m7
+  %define shufB m8
+  %define shufC m9
+ %else
+  %define shufA [base+subpel_h_shufA]
+  %define shufB [base+subpel_h_shufB]
+  %define shufC [base+subpel_h_shufC]
+ %endif
+%endif
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+    lea                srcq, [srcq+strideq*2]
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m9, m4
+ %else
+    mova              [esp], m4
+ %endif
+%endif
+    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
+    PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
+    lea                srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+    mova                 m7, [base+pw_8192]
+%else
+    mova                 m7, [base+pw_2]
+ %if ARCH_X86_64
+    SWAP                 m4, m9
+ %else
+    mova                 m4, [esp]
+ %endif
+%endif
+    PMULHRSW_8192        m4, m4, m7
+    PMULHRSW_8192        m5, m5, m7
+    PMULHRSW_8192        m6, m6, m7
+    PMULHRSW_8192        m0, m0, m7
+    punpcklwd            m1, m4, m5 ; 01
+    punpcklwd            m2, m5, m6 ; 12
+    punpcklwd            m3, m6, m0 ; 23
+    SAVELINE_W8           1, m1
+    SAVELINE_W8           2, m2
+    SAVELINE_W8           3, m3
+%if cpuflag(ssse3)
+    mova                 m7, [base+subpel_h_shufA]
+%else
+ %if ARCH_X86_64
+    SWAP                 m8, m7
+    SWAP                 m9, m0
+ %else
+    mova         [esp+0x30], m0
+ %endif
+%endif
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
+    lea                srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+    mova                 m7, [base+pw_8192]
+%else
+ %if ARCH_X86_64
+    SWAP                 m0, m9
+    SWAP                 m7, m8
+ %else
+    mova                 m0, [esp+0x30]
+    mova                 m7, [base+pw_2]
+ %endif
+%endif
+    PMULHRSW_8192        m1, m4, m7
+    PMULHRSW_8192        m2, m5, m7
+    PMULHRSW_8192        m3, m6, m7
+    punpcklwd            m4, m0, m1 ; 34
+    punpcklwd            m5, m1, m2 ; 45
+    punpcklwd            m6, m2, m3 ; 56
+    SAVELINE_W8           6, m3
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+.hv_w8_loop:
+    SAVELINE_W8           1, m3
+    SAVELINE_W8           2, m4
+    SAVELINE_W8           3, m5
+    SAVELINE_W8           4, m6
+%if ARCH_X86_32
+    pmaddwd              m0, m1, subpelv0 ; a0
+    pmaddwd              m7, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd                m0, m3
+    paddd                m7, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd                m0, m5
+    paddd                m7, m6
+    mova                 m5, [base+pd_32]
+    paddd                m0, m5
+    paddd                m7, m5
+    mova             accuv0, m0
+    mova             accuv1, m7
+%else
+    pmaddwd          accuv0, m1, subpelv0 ; a0
+    pmaddwd          accuv1, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd            accuv0, m3
+    paddd            accuv1, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd            accuv0, m5
+    paddd            accuv1, m6
+    mova                 m7, [base+pd_32]
+    paddd            accuv0, m7
+    paddd            accuv1, m7
+ %if cpuflag(ssse3)
+    mova                 m7, [base+subpel_h_shufB]
+    mova                 m6, [base+subpel_h_shufC]
+    mova                 m5, [base+subpel_h_shufA]
+  %define shufA m5
+  %define shufB m7
+  %define shufC m6
+ %endif
+%endif
+    PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
+    PREP_8TAP_HV         m4, srcq+strideq*2, m5, m6
+    lea                srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+    mova                 m5, [base+pw_8192]
+%else
+    mova                 m5, [base+pw_2]
+%endif
+    PMULHRSW_8192        m0, m0, m5
+    PMULHRSW_8192        m4, m4, m5
+    RESTORELINE_W8        6, m6
+    punpcklwd            m5, m6, m0 ; 67
+    punpcklwd            m6, m0, m4 ; 78
+    pmaddwd              m1, m5, subpelv3 ; a3
+    paddd                m2, m1, accuv0
+    pmaddwd              m1, m6, subpelv3 ; b3
+    paddd                m1, m1, accuv1
+    psrad                m2, 6
+    psrad                m1, 6
+    packssdw             m2, m1
+    movq        [tmpq+wq*0], m2
+    movhps      [tmpq+wq*2], m2
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jle .hv_w8_outer
+    SAVELINE_W8           6, m4
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+    RESTORELINE_W8        4, m4
+    jmp .hv_w8_loop
+.hv_w8_outer:
+    movzx                hd, r5w
+%if ARCH_X86_32
+    add          dword tmpm, 8
+    mov                tmpq, tmpm
+    mov                srcq, srcm
+    add                srcq, 4
+    mov                srcm, srcq
+%else
+    add                  r8, 8
+    mov                tmpq, r8
+    add                  r6, 4
+    mov                srcq, r6
+%endif
+    sub                 r5d, 1<<16
+    jg .hv_w8_loop0
+    RET
+%endmacro
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+    mov              alpham, alphad
+    mov               betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+    mov              deltam, deltad
+    mov              gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+    mov                 mym, myd
+    mov              alphad, alpham
+    mov               betad, betam
+    mov                 mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+    mov                 mxm, mxd
+    mov              deltad, deltam
+    mov              gammad, gammam
+    mov                 myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+  %assign copy_args 8*4
+ %else
+  %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+    mov                  r0, r0m
+    mov                  r1, r1m
+    mov                  r2, r2m
+    mov                  r3, r3m
+    mov                  r5, r5m
+    mov                dstm, r0
+    mov                 dsm, r1
+    mov                srcm, r2
+    mov                 ssm, r3
+    mov                 mxm, r5
+    mov                  r0, r6m
+    mov                 mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
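+ ; SSE4: pblendw 0xAA takes the high word of each dword from %2. The SSE2
+ ; fallback assumes the high words of %1 are already zero (%1 comes from a
+ ; psrld 16) and that m10 holds the 0xffff0000 blendmask.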
+ %if cpuflag(sse4)
+    pblendw              %1, %2, 0xAA
+ %else
+    pand                 %2, m10
+    por                  %1, %2
+ %endif
+%endmacro
+
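+; WARP_V: one vertical 8-tap pass producing 8 pixels as two sets of four
+; dword sums (%1, %2). Column k (a..h) uses the mc_warp_filter entry
+; indexed by (my + k*delta) >> 10; the final "my += gamma" nets one row
+; step because .main pre-biased gamma by -delta*3.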
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+  %define m8  m4
+  %define m9  m5
+  %define m14 m6
+  %define m15 m7
+  %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+    pxor                m11, m11
+ %endif
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                 m2, [filterq+myq  *8] ; a
+    movq                 m8, [filterq+tmp1q*8] ; e
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+deltaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                 m3, [filterq+tmp2q*8] ; b
+    movq                 m0, [filterq+tmp1q*8] ; f
+    punpcklwd            m2, m3
+    punpcklwd            m8, m0
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                 m0, [filterq+myq  *8] ; c
+    movq                 m9, [filterq+tmp1q*8] ; g
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+gammaq]       ; my += gamma
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                 m3, [filterq+tmp2q*8] ; d
+    movq                 m1, [filterq+tmp1q*8] ; h
+    punpcklwd            m0, m3
+    punpcklwd            m9, m1
+    punpckldq            m1, m2, m0
+    punpckhdq            m2, m0
+    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+    pmaddwd              m0, %3
+    pmaddwd              m3, %5
+    pmaddwd              m1, %7
+    pmaddwd             m14, %9
+    paddd                m0, m3
+    paddd                m1, m14
+    paddd                m0, m1
+    mova                 %1, m0
+ %if ARCH_X86_64
+    SWAP                 m3, m14
+ %endif
+    punpckldq            m0, m8, m9
+    punpckhdq            m8, m9
+    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+    pmaddwd              m1, %4
+    pmaddwd             m14, %6
+    pmaddwd              m2, %8
+    pmaddwd             m15, %10
+    paddd                m1, m14
+    paddd                m2, m15
+    paddd                m1, m2
+    mova                 %2, m1
+ %if ARCH_X86_64
+    SWAP                m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+  %define counterd dword r4m
+ %else
+  %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+  %define tmpm [esp+stack_size-4*1]
+  %define tsm  [esp+stack_size-4*2]
+ %endif
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+    mova                m12, [esp+0xC0]
+    mova                m13, [esp+0xD0]
+    mova                m14, [esp+0xE0]
+    mova                m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+    psrad               m12, 13
+    psrad               m13, 13
+    psrad               m14, 13
+    psrad               m15, 13
+    packssdw            m12, m13
+    packssdw            m14, m15
+    mova                m13, [PIC_sym(pw_8192)]
+    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
+    pmulhrsw            m14, m13
+%else
+ %if ARCH_X86_32
+  %define m10 m0
+ %endif
+    mova                m10, [PIC_sym(pd_16384)]
+    paddd               m12, m10
+    paddd               m13, m10
+    paddd               m14, m10
+    paddd               m15, m10
+    psrad               m12, 15
+    psrad               m13, 15
+    psrad               m14, 15
+    psrad               m15, 15
+    packssdw            m12, m13
+    packssdw            m14, m15
+%endif
+    mova       [tmpq+tsq*0], m12
+    mova       [tmpq+tsq*2], m14
+    dec            counterd
+    jz   mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
+%if ARCH_X86_32
+    mov                tmpm, tmpd
+    mov                  r0, [esp+0x100]
+    mov                  r1, [esp+0x104]
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
+    lea                tmpq, [tmpq+tsq*4]
+    jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
+                         dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+                         filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
+                         dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+                         filter, tmp1, delta, my, gamma
+ %define alphaq     r0
+ %define alphad     r0
+ %define alpham     [esp+gprsize+0x100]
+ %define betaq      r1
+ %define betad      r1
+ %define betam      [esp+gprsize+0x104]
+ %define deltaq     r0
+ %define deltad     r0
+ %define deltam     [esp+gprsize+0x108]
+ %define gammaq     r1
+ %define gammad     r1
+ %define gammam     [esp+gprsize+0x10C]
+ %define filterq    r3
+ %define tmp1q      r4
+ %define tmp1d      r4
+ %define tmp1m      [esp+gprsize+0x110]
+ %define myq        r5
+ %define myd        r5
+ %define mym        r6m
+ %if copy_args
+  %define dstm [esp+stack_size-4*1]
+  %define dsm  [esp+stack_size-4*2]
+  %define srcm [esp+stack_size-4*3]
+  %define ssm  [esp+stack_size-4*4]
+  %define mxm  [esp+stack_size-4*5]
+  %define mym  [esp+stack_size-4*6]
+ %endif
+%endif
+    call .main
+    jmp .start
+.loop:
+%if ARCH_X86_32
+    mov                dstm, dstd
+    mov              alphad, [esp+0x100]
+    mov               betad, [esp+0x104]
+%endif
+    call .main2
+    lea                dstq, [dstq+dsq*2]
+.start:
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+  %define roundval pw_8192
+ %else
+  %define roundval pd_262144
+ %endif
+ %if ARCH_X86_64
+    mova                m10, [PIC_sym(roundval)]
+ %else
+  %define m10 [PIC_sym(roundval)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+    mova                m12, [esp+0xC0]
+    mova                m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+  %define m11 m4
+    pxor                m11, m11
+ %endif
+    psrad               m12, 18
+    psrad               m13, 18
+    packusdw            m12, m13
+    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+    psrad               m12, 17
+    psrad               m13, 17
+    packssdw            m12, m13
+    pmulhrsw            m12, m10
+ %else
+    paddd               m12, m10
+    paddd               m13, m10
+    psrad               m12, 19
+    psrad               m13, 19
+    packssdw            m12, m13
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+    mova                m14, [esp+0xE0]
+    mova                m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+    psrad               m14, 18
+    psrad               m15, 18
+    packusdw            m14, m15
+    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+    psrad               m14, 17
+    psrad               m15, 17
+    packssdw            m14, m15
+    pmulhrsw            m14, m10
+ %else
+    paddd               m14, m10
+    paddd               m15, m10
+    psrad               m14, 19
+    psrad               m15, 19
+    packssdw            m14, m15
+ %endif
+%endif
+    packuswb            m12, m14
+    movq       [dstq+dsq*0], m12
+    movhps     [dstq+dsq*1], m12
+    dec            counterd
+    jg .loop
+.end:
+    RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+  %assign stack_offset stack_offset-4
+ %endif
+    RELOC_ARGS
+    LEA             PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+    mov               abcdd, abcdm
+ %if copy_args == 0
+    mov                 ssd, ssm
+    mov                 mxd, mxm
+ %endif
+    mov             PIC_mem, PIC_reg
+    mov                srcd, srcm
+%endif
+    movsx            deltad, word [abcdq+2*2]
+    movsx            gammad, word [abcdq+2*3]
+    lea               tmp1d, [deltaq*3]
+    sub              gammad, tmp1d    ; gamma -= delta*3
+    SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+    mov               abcdd, abcdm
+%endif
+    movsx            alphad, word [abcdq+2*0]
+    movsx             betad, word [abcdq+2*1]
+    lea               tmp1q, [ssq*3+3]
+    add                 mxd, 512+(64<<10)
+    lea               tmp2d, [alphaq*3]
+    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+    mov                srcm, srcd
+    mov             PIC_reg, PIC_mem
+%endif
+    sub               betad, tmp2d    ; beta -= alpha*3
+    lea             filterq, [PIC_sym(mc_warp_filter)]
+%if ARCH_X86_64
+    mov                 myd, r6m
+ %if cpuflag(ssse3)
+    pxor                m11, m11
+ %endif
+%endif
+    call .h
+    psrld                m2, m0, 16
+    psrld                m3, m1, 16
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+    mova [esp+gprsize+0x00], m2
+ %endif
+    mova [esp+gprsize+0x10], m3
+%endif
+    call .h
+    psrld                m4, m0, 16
+    psrld                m5, m1, 16
+%if ARCH_X86_32
+    mova [esp+gprsize+0x20], m4
+    mova [esp+gprsize+0x30], m5
+%endif
+    call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ %if notcpuflag(ssse3)
+    mova                 m2, [esp+gprsize+0x00]
+ %endif
+    mova                 m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+    pcmpeqd             m10, m10
+    pslld               m10, 16
+    mova          blendmask, m10
+    BLENDHWDW            m2, m0 ; 0
+    BLENDHWDW            m3, m1 ; 2
+    mova [rsp+gprsize+0x00], m2
+    mova [rsp+gprsize+0x10], m3
+    call .h
+%if ARCH_X86_32
+    mova                 m4, [esp+gprsize+0x20]
+    mova                 m5, [esp+gprsize+0x30]
+%endif
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 1
+    BLENDHWDW            m5, m1 ; 3
+    mova [rsp+gprsize+0x20], m4
+    mova [rsp+gprsize+0x30], m5
+    call .h
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+    mova                 m2, [esp+gprsize+0x00]
+ %endif
+    mova                 m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+    psrld                m6, m2, 16
+    psrld                m7, m3, 16
+    mova                m10, blendmask
+    BLENDHWDW            m6, m0 ; 2
+    BLENDHWDW            m7, m1 ; 4
+    mova [rsp+gprsize+0x40], m6
+    mova [rsp+gprsize+0x50], m7
+    call .h
+%if ARCH_X86_32
+    mova                m4, [esp+gprsize+0x20]
+    mova                m5, [esp+gprsize+0x30]
+%endif
+    psrld               m2, m4, 16
+    psrld               m3, m5, 16
+    mova                m10, blendmask
+    BLENDHWDW           m2, m0 ; 3
+    BLENDHWDW           m3, m1 ; 5
+    mova [rsp+gprsize+0x60], m2
+    mova [rsp+gprsize+0x70], m3
+    call .h
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0x40]
+    mova                 m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+    psrld                m4, m6, 16
+    psrld                m5, m7, 16
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 4
+    BLENDHWDW            m5, m1 ; 6
+%if ARCH_X86_64
+    add                 myd, 512+(64<<10)
+    mova                 m6, m2
+    mova                 m7, m3
+%else
+    mova [esp+gprsize+0x80], m4
+    mova [esp+gprsize+0x90], m5
+    add           dword mym, 512+(64<<10)
+%endif
+    mov            counterd, 4
+    SAVE_ALPHA_BETA
+.main2:
+    call .h
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0x60]
+    mova                 m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+    psrld                m6, 16
+    psrld                m7, 16
+    mova                m10, blendmask
+    BLENDHWDW            m6, m0 ; 5
+    BLENDHWDW            m7, m1 ; 7
+%if ARCH_X86_64
+    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+                                  m4, m5, \
+                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+                                  m6, m7
+%else
+    mova [esp+gprsize+0xA0], m6
+    mova [esp+gprsize+0xB0], m7
+    LOAD_DELTA_GAMMA_MY
+    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+           [esp+gprsize+0x00], [esp+gprsize+0x10], \
+           [esp+gprsize+0x80], [esp+gprsize+0x90], \
+           [esp+gprsize+0x20], [esp+gprsize+0x30], \
+           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+    LOAD_ALPHA_BETA_MX
+%endif
+    call .h
+    mova                 m2, [rsp+gprsize+0x40]
+    mova                 m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+    mova                 m4, [rsp+gprsize+0x80]
+    mova                 m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+    mova [rsp+gprsize+0x00], m2
+    mova [rsp+gprsize+0x10], m3
+    mova [rsp+gprsize+0x40], m4
+    mova [rsp+gprsize+0x50], m5
+    psrld                m4, 16
+    psrld                m5, 16
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 6
+    BLENDHWDW            m5, m1 ; 8
+%if ARCH_X86_64
+    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+                                  m6, m7, \
+                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+                                  m4, m5
+%else
+    mova [esp+gprsize+0x80], m4
+    mova [esp+gprsize+0x90], m5
+    LOAD_DELTA_GAMMA_MY
+    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+           [esp+gprsize+0x20], [esp+gprsize+0x30], \
+           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+           [esp+gprsize+0x00], [esp+gprsize+0x10], \
+           [esp+gprsize+0x80], [esp+gprsize+0x90]
+    mov                 mym, myd
+    mov                dstd, dstm
+    mov                 dsd, dsm
+    mov                 mxd, mxm
+%endif
+    mova                 m2, [rsp+gprsize+0x60]
+    mova                 m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0xA0]
+    mova                 m7, [esp+gprsize+0xB0]
+%endif
+    mova [rsp+gprsize+0x20], m2
+    mova [rsp+gprsize+0x30], m3
+    mova [rsp+gprsize+0x60], m6
+    mova [rsp+gprsize+0x70], m7
+    ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8  m3
+ %define m9  m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+    mov             PIC_mem, PIC_reg
+    mov                srcd, srcm
+%endif
+    movu                m10, [srcq]
+%if ARCH_X86_32
+    add                srcd, ssm
+    mov                srcm, srcd
+    mov             PIC_reg, PIC_mem
+%else
+    add                srcq, ssq
+%endif
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    movq                 m1, [filterq+mxq  *8]  ; 0 X
+    movq                 m8, [filterq+tmp1q*8]  ; 4 X
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+alphaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movhps               m1, [filterq+tmp2q*8]  ; 0 1
+    movhps               m8, [filterq+tmp1q*8]  ; 4 5
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+    shr                 mxd, 10
+    shr               tmp1d, 10
+%if cpuflag(ssse3)
+    movq                m14, [filterq+mxq  *8]  ; 2 X
+    movq                 m9, [filterq+tmp1q*8]  ; 6 X
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+betaq]  ; mx += beta
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movhps              m14, [filterq+tmp2q*8]  ; 2 3
+    movhps               m9, [filterq+tmp1q*8]  ; 6 7
+    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
+    pmaddubsw            m0, m1
+    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
+    pmaddubsw            m1, m8
+    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
+    pmaddubsw           m15, m14
+    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
+    pmaddubsw           m10, m9
+    phaddw               m0, m15
+    phaddw               m1, m10
+%else
+ %if ARCH_X86_32
+  %define m11 m2
+ %endif
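+    ; SSE2 path: emulate pmaddubsw (u8 x s8 -> pairwise s16 sums) with
+    ; psllw/psrlw zero-extension, psllw/psraw sign-extension and
+    ; pmullw/paddw, and synthesize the four subpel shuffles (shufA-D)
+    ; by repeated psrldq/pslldq byte rotations of the source row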
+    pcmpeqw              m0, m0
+    psrlw               m14, m0, 8
+    psrlw               m15, m10, 8     ; 01 03 05 07  09 11 13 15
+    pand                m14, m10        ; 00 02 04 06  08 10 12 14
+    packuswb            m14, m15        ; 00 02 04 06  08 10 12 14  01 03 05 07  09 11 13 15
+    psrldq               m9, m0, 4
+    pshufd               m0, m14, q0220
+    pand                 m0, m9
+    psrldq              m14, 1          ; 02 04 06 08  10 12 14 01  03 05 07 09  11 13 15 __
+    pslldq              m15, m14, 12
+    por                  m0, m15    ; shufA
+    psrlw               m15, m0, 8
+    psraw               m11, m1, 8
+    psllw                m0, 8
+    psllw                m1, 8
+    psrlw                m0, 8
+    psraw                m1, 8
+    pmullw              m15, m11
+    pmullw               m0, m1
+    paddw                m0, m15    ; pmaddubsw m0, m1
+    pshufd              m15, m14, q0220
+    pand                m15, m9
+    psrldq              m14, 1          ; 04 06 08 10  12 14 01 03  05 07 09 11  13 15 __ __
+    pslldq               m1, m14, 12
+    por                 m15, m1     ; shufC
+    pshufd               m1, m14, q0220
+    pand                 m1, m9
+    psrldq              m14, 1          ; 06 08 10 12  14 01 03 05  07 09 11 13  15 __ __ __
+    pslldq              m11, m14, 12
+    por                  m1, m11    ; shufB
+    pshufd              m10, m14, q0220
+    pand                m10, m9
+    psrldq              m14, 1          ; 08 10 12 14  01 03 05 07  09 11 13 15  __ __ __ __
+    pslldq              m14, m14, 12
+    por                 m10, m14    ; shufD
+    psrlw                m9, m1, 8
+    psraw               m11, m8, 8
+    psllw                m1, 8
+    psllw                m8, 8
+    psrlw                m1, 8
+    psraw                m8, 8
+    pmullw               m9, m11
+    pmullw               m1, m8
+    paddw                m1, m9     ; pmaddubsw m1, m8
+    movq                m14, [filterq+mxq  *8]  ; 2 X
+    movq                 m9, [filterq+tmp1q*8]  ; 6 X
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+betaq]  ; mx += beta
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movhps              m14, [filterq+tmp2q*8]  ; 2 3
+    movhps               m9, [filterq+tmp1q*8]  ; 6 7
+    psrlw                m8, m15, 8
+    psraw               m11, m14, 8
+    psllw               m15, 8
+    psllw               m14, 8
+    psrlw               m15, 8
+    psraw               m14, 8
+    pmullw               m8, m11
+    pmullw              m15, m14
+    paddw               m15, m8     ; pmaddubsw m15, m14
+    psrlw                m8, m10, 8
+    psraw               m11, m9, 8
+    psllw               m10, 8
+    psllw                m9, 8
+    psrlw               m10, 8
+    psraw                m9, 8
+    pmullw               m8, m11
+    pmullw              m10, m9
+    paddw               m10, m8     ; pmaddubsw m10, m9
+    pslld                m8, m0, 16
+    pslld                m9, m1, 16
+    pslld               m14, m15, 16
+    pslld               m11, m10, 16
+    paddw                m0, m8
+    paddw                m1, m9
+    paddw               m15, m14
+    paddw               m10, m11
+    psrad                m0, 16
+    psrad                m1, 16
+    psrad               m15, 16
+    psrad               m10, 16
+    packssdw             m0, m15    ; phaddw m0, m15
+    packssdw             m1, m10    ; phaddw m1, m10
+%endif
+    mova                m14, [PIC_sym(pw_8192)]
+    mova                 m9, [PIC_sym(pd_32768)]
+    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
+    pmaddwd              m1, m14
+    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
+    paddd                m1, m9
+    ret
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
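+; BIDIR_FN expands the shared store-loop skeleton for the bidirectional
+; ops below (avg, w_avg, mask): each op macro leaves 16 packed u8 pixels
+; in m0, and width dispatch is a computed jmp through the per-width jump
+; table the caller loaded into wq.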
+%macro BIDIR_FN 1 ; op
+    %1                    0
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w4: ; m0 holds 4 rows of 4 pixels each
+    movd   [dstq          ], m0      ; copy dw[0]
+    pshuflw              m1, m0, q1032 ; swap dw[1] and dw[0]
+    movd   [dstq+strideq*1], m1      ; copy dw[1]
+    punpckhqdq           m0, m0      ; move dw[3,2] down into dw[1,0]
+    movd   [dstq+strideq*2], m0      ; copy dw[2]
+    psrlq                m0, 32      ; shift dw[3] down
+    movd   [dstq+stride3q ], m0      ; copy dw[3]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+.w8_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*2]
+.w8:
+    movq   [dstq          ], m0
+    movhps [dstq+strideq*1], m0
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq]
+.w16:
+    mova   [dstq          ], m0
+    dec                  hd
+    jg .w16_loop
+    RET
+.w32_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq]
+.w32:
+    mova   [dstq          ], m0
+    %1                    2
+    mova   [dstq + 16     ], m0
+    dec                  hd
+    jg .w32_loop
+    RET
+.w64_loop:
+    %1_INC_PTR            8
+    %1                    0
+    add                dstq, strideq
+.w64:
+    %assign i 0
+    %rep 4
+    mova   [dstq + i*16   ], m0
+    %assign i i+1
+    %if i < 4
+    %1                    2*i
+    %endif
+    %endrep
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    %1_INC_PTR            16
+    %1                    0
+    add                dstq, strideq
+.w128:
+    %assign i 0
+    %rep 8
+    mova   [dstq + i*16   ], m0
+    %assign i i+1
+    %if i < 8
+    %1                    2*i
+    %endif
+    %endrep
+    dec                  hd
+    jg .w128_loop
+    RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+    ; averages the int16 coefficients from tmp1/tmp2 into uint8 pixels
+    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
+    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 from tmp2
+    mova                 m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m1, [tmp2q+(%1+1)*mmsize]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    packuswb             m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
+%endmacro
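+; Scalar sketch of AVG (illustration only; avg_px is a hypothetical helper,
+; not in the sources), assuming 8bpc "prep" intermediates where pmulhrsw
+; with pw_1024 realizes (x + 16) >> 5:
+;
+;   static uint8_t avg_px(int16_t t1, int16_t t2) {
+;       int x = t1 + t2;                       // paddw
+;       x = (x + 16) >> 5;                     // pmulhrsw(x, pw_1024)
+;       return x < 0 ? 0 : x > 255 ? 255 : x;  // packuswb saturation
+;   }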
+
+%macro AVG_INC_PTR 1
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+    LEA                  r6, avg_ssse3_table
+    tzcnt                wd, wm ; trailing zeros = log2 of the tile width
+    movifnidn            hd, hm ; move h(stack) to h(register) if not already that register
+    movsxd               wq, dword [r6+wq*4] ; load the sign-extended jump-table offset for this width
+    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024: pmulhrsw rounding factor, (x + 16) >> 5
+    add                  wq, r6
+    BIDIR_FN            AVG
+
+%macro W_AVG 1 ; src_offset
+    ; (a * weight + b * (16 - weight) + 128) >> 8
+    ; = ((a - b) * weight + (b << 4) + 128) >> 8
+    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
+    mova                 m2, [tmp1q+(%1+0)*mmsize]
+    mova                 m0, m2
+    psubw                m2, [tmp2q+(%1+0)*mmsize]
+    mova                 m3, [tmp1q+(%1+1)*mmsize]
+    mova                 m1, m3
+    psubw                m3, [tmp2q+(%1+1)*mmsize]
+    pmulhw               m2, m4
+    pmulhw               m3, m4
+    paddw                m0, m2
+    paddw                m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
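+; Scalar sketch of W_AVG for the weight >= 8 path (illustration only;
+; w_avg_px is hypothetical): pmulhw keeps the high 16 bits of the product
+; and pmulhrsw with pw_2048 realizes (x + 8) >> 4.
+;
+;   static uint8_t w_avg_px(int16_t a, int16_t b, int weight) {
+;       int d = ((a - b) * ((weight - 16) << 12)) >> 16; // pmulhw
+;       int x = (a + d + 8) >> 4;                        // paddw + pmulhrsw
+;       return x < 0 ? 0 : x > 255 ? 255 : x;            // packuswb
+;   }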
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+    LEA                  r6, w_avg_ssse3_table
+    tzcnt                wd, wm
+    movd                 m4, r6m
+    movifnidn            hd, hm
+    pxor                 m0, m0
+    movsxd               wq, dword [r6+wq*4]
+    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
+    pshufb               m4, m0
+    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
+    add                  wq, r6
+    cmp           dword r6m, 7
+    jg .weight_gt7
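+    ; weight <= 7: (weight-16) << 12 does not fit in int16 (the psllw above
+    ; left it with the wrong sign), so swap tmp1/tmp2 and use the
+    ; equivalent -weight form from the last line of the derivation above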
+    mov                  r6, tmp1q
+    psubw                m0, m4
+    mov               tmp1q, tmp2q
+    mova                 m4, m0 ; -weight
+    mov               tmp2q, r6
+.weight_gt7:
+    BIDIR_FN          W_AVG
+
+%macro MASK 1 ; src_offset
+    ; (a * m + b * (64 - m) + 512) >> 10
+    ; = ((a - b) * m + (b << 6) + 512) >> 10
+    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+    mova                 m3,     [maskq+(%1+0)*(mmsize/2)]
+    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
+    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+    mova                 m6, m3      ; m
+    psubb                m3, m4, m6  ; -m
+    paddw                m1, m1     ; (b - a) << 1
+    paddb                m3, m3     ; -m << 1
+    punpcklbw            m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+    pmulhw               m1, m2     ; (-m * (b - a)) << 10
+    paddw                m0, m1     ; + b
+    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
+    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+    paddw                m2, m2  ; (b - a) << 1
+    mova                 m6, m3  ; (-m << 1)
+    punpckhbw            m3, m4, m6 ; (-m << 9)
+    pmulhw               m2, m3 ; (-m * (b - a)) << 10
+    paddw                m1, m2 ; + b
+    pmulhrsw             m0, m5 ; round
+    pmulhrsw             m1, m5 ; round
+    packuswb             m0, m1 ; interleave 16 -> 8
+%endmacro
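+; Scalar sketch of MASK (illustration only; mask_px is hypothetical); the
+; <<1 / <<9 pre-shifts line the pmulhw product up with the derivation:
+;
+;   static uint8_t mask_px(int16_t a, int16_t b, int m) { // m in 0..64
+;       int d = (((b - a) * 2) * (-m * 512)) >> 16; // pmulhw((b-a)<<1, -m<<9)
+;       int x = (b + d + 8) >> 4;                   // paddw + pmulhrsw(pw_2048)
+;       return x < 0 ? 0 : x > 255 ? 255 : x;       // packuswb
+;   }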
+
+%macro MASK_INC_PTR 1
+    add               maskq, %1*mmsize/2
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+%if ARCH_X86_64
+cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+    movifnidn            hd, hm
+%else
+cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+%define hd dword r5m
+%endif
+%define base r6-mask_ssse3_table
+    LEA                  r6, mask_ssse3_table
+    tzcnt                wd, wm
+    movsxd               wq, dword [r6+wq*4]
+    pxor                 m4, m4
+    mova                 m5, [base+pw_2048]
+    add                  wq, r6
+    mov               maskq, r6m
+    BIDIR_FN           MASK
+%undef hd
+
+%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
+    ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
+    mova                 m0, [tmp1q+(%1)]
+    mova                 m1, [tmp2q+(%1)]
+    mova                 m2, reg_pw_6903
+    psubw                m1, m0
+    pabsw               m%2, m1 ; abs(tmp1 - tmp2)
+    mova                 m3, m2
+    psubusw              m2, m%2
+    psrlw                m2, 8  ; 64 - m
+    mova                m%2, m2
+    psllw                m2, 10
+    pmulhw               m1, m2 ; ((tmp2 - tmp1) * ((64 - m) << 10)) >> 16
+    paddw                m0, m1 ; tmp1 + the term above
+    ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
+    mova                 m1, [tmp1q+(%1)+mmsize]
+    mova                 m2, [tmp2q+(%1)+mmsize]
+    psubw                m2, m1
+    pabsw                m7, m2 ; abs(tmp1 - tmp2)
+    psubusw              m3, m7
+    psrlw                m3, 8  ; 64 - m
+    phaddw              m%2, m3 ; pair-sum both (64 - m) runs: 16 words -> 8 words
+    psllw                m3, 10
+    pmulhw               m2, m3
+%if ARCH_X86_32
+    mova        reg_pw_2048, [base+pw_2048]
+%endif
+    paddw                m1, m2
+    pmulhrsw             m0, reg_pw_2048 ; round/scale 2048
+    pmulhrsw             m1, reg_pw_2048 ; round/scale 2048
+    packuswb             m0, m1 ; concat m0 = u8.dst[15..0]
+%endmacro
+
+%macro W_MASK_420 2
+    W_MASK_420_B (%1*16), %2
+%endmacro
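+; Net effect of W_MASK_420 (scalar sketch, illustration only): per pixel
+;   64 - m = umax(6903 - abs(t1 - t2), 0) >> 8       ; so m is in [38, 64]
+;   dst    = sat_u8((t1 + (((t2 - t1) * ((64 - m) << 10)) >> 16) + 8) >> 4)
+; and per 2x2 block the 4:2:0 mask output works out to
+;   mask   = (m00 + m01 + m10 + m11 + 2 - sign) >> 2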
+
+%define base r6-w_mask_420_ssse3_table
+%if ARCH_X86_64
+%define reg_pw_6903 m8
+%define reg_pw_2048 m9
+; args: dst, stride, tmp1, tmp2, w, h, mask, sign
+cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
+    lea                  r6, [w_mask_420_ssse3_table]
+    mov                  wd, wm
+    tzcnt               r7d, wd
+    movd                 m0, r7m ; sign
+    movifnidn            hd, hm
+    movsxd               r7, [r6+r7*4]
+    mova        reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    mova        reg_pw_2048, [base+pw_2048]
+    movd                 m6, [base+pw_258]  ; 64 * 4 + 2
+    add                  r7, r6
+    mov               maskq, maskmp
+    psubw                m6, m0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    W_MASK_420            0, 4
+    jmp                  r7
+    %define loop_w      r7d
+%else
+%define reg_pw_6903 [base+pw_6903]
+%define reg_pw_2048 m3
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+    tzcnt                wd, wm
+    LEA                  r6, w_mask_420_ssse3_table
+    movd                 m0, r7m ; sign
+    mov               maskq, r6mp
+    mov                  wd, [r6+wq*4]
+    movd                 m6, [base+pw_258]
+    add                  wq, r6
+    psubw                m6, m0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    W_MASK_420            0, 4
+    jmp                  wd
+    %define loop_w dword r0m
+    %define hd     dword r5m
+%endif
+.w4_loop:
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 4
+.w4:
+    movd   [dstq          ], m0 ; copy m0[0]
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq*1], m1 ; copy m0[1]
+    lea                dstq, [dstq+strideq*2]
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*0], m0 ; copy m0[2]
+    psrlq                m0, 32
+    movd   [dstq+strideq*1], m0 ; copy m0[3]
+    psubw                m1, m6, m4 ; a _ c _
+    psrlq                m4, 32     ; b _ d _
+    psubw                m1, m4
+    psrlw                m1, 2
+    packuswb             m1, m1
+    pshuflw              m1, m1, q2020
+    movd            [maskq], m1
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+.w8_loop:
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 4
+.w8:
+    movq   [dstq          ], m0
+    movhps [dstq+strideq*1], m0
+    psubw                m0, m6, m4
+    punpckhqdq           m4, m4
+    psubw                m0, m4
+    psrlw                m0, 2
+    packuswb             m0, m0
+    movd            [maskq], m0
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16: ; w32/64/128
+%if ARCH_X86_32
+    mov                  wd, wm     ; because we altered it in 32bit setup
+%endif
+    mov              loop_w, wd     ; use width as counter
+    jmp .w16ge_inner_loop_first
+.w16ge_loop:
+    lea               tmp1q, [tmp1q+wq*2] ; skip the odd line (consumed at offset wq*2)
+    lea               tmp2q, [tmp2q+wq*2] ; skip the odd line (consumed at offset wq*2)
+    sub                dstq, wq
+    mov              loop_w, wd
+    lea                dstq, [dstq+strideq*2]
+.w16ge_inner_loop:
+    W_MASK_420_B          0, 4
+.w16ge_inner_loop_first:
+    mova   [dstq          ], m0
+    W_MASK_420_B       wq*2, 5  ; the matching odd line (offset = width * 2 bytes)
+    mova   [dstq+strideq*1], m0
+    psubw                m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
+    psubw                m1, m5     ; - odd line mask
+    psrlw                m1, 2      ; >> 2
+    packuswb             m1, m1
+    movq            [maskq], m1
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    add               maskq, 8
+    add                dstq, 16
+    sub              loop_w, 16
+    jg .w16ge_inner_loop
+    sub                  hd, 2
+    jg .w16ge_loop
+    RET
+
+%undef reg_pw_6903
+%undef reg_pw_2048
+%undef dst_bak
+%undef loop_w
+%undef orig_w
+%undef hd
+
+%macro BLEND_64M 4; a, b, mask1, mask2
+    punpcklbw            m0, %1, %2; {b;a}[7..0]
+    punpckhbw            %1, %2    ; {b;a}[15..8]
+    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
+    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
+    pmulhrsw             m0, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
+    pmulhrsw             %1, m5    ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16
+    packuswb             m0, %1    ; {blendpx}[15..0] u8
+%endmacro
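+; Per pixel this is the usual obmc blend (scalar sketch, illustration only;
+; blend_px is hypothetical); pmulhrsw with pw_512 realizes (x + 32) >> 6:
+;
+;   static uint8_t blend_px(uint8_t a, uint8_t b, int m) { // m in 0..64
+;       return (b * m + a * (64 - m) + 32) >> 6; // pmaddubsw + pmulhrsw
+;   }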
+
+%macro BLEND 2; a, b
+    psubb                m3, m4, m0 ; m3 = (64 - m)
+    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
+    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
+    BLEND_64M            %1, %2, m2, m3
+%endmacro
+
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_ssse3_table
+    LEA                  r6, blend_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movifnidn         maskq, maskmp
+    movsxd               wq, dword [r6+wq*4]
+    mova                 m4, [base+pb_64]
+    mova                 m5, [base+pw_512]
+    add                  wq, r6
+    lea                  r6, [dsq*3]
+    jmp                  wq
+.w4:
+    movq                 m0, [maskq]; m
+    movd                 m1, [dstq+dsq*0] ; a
+    movd                 m6, [dstq+dsq*1]
+    punpckldq            m1, m6
+    movq                 m6, [tmpq] ; b
+    psubb                m3, m4, m0 ; m3 = (64 - m)
+    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
+    punpcklbw            m1, m6    ; {b;a}[7..0]
+    pmaddubsw            m1, m2    ; {b*m[0] + (64-m[0])*a}[7..0] u16
+    pmulhrsw             m1, m5    ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+    packuswb             m1, m0    ; {blendpx}[15..0] u8
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    add               maskq, 8
+    add                tmpq, 8
+    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
+    sub                  hd, 2
+    jg .w4
+    RET
+.w8:
+    mova                 m0, [maskq]; m
+    movq                 m1, [dstq+dsq*0] ; a
+    movhps               m1, [dstq+dsq*1]
+    mova                 m6, [tmpq] ; b
+    BLEND                m1, m6
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    add               maskq, 16
+    add                tmpq, 16
+    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
+    sub                  hd, 2
+    jg .w8
+    RET
+.w16:
+    mova                 m0, [maskq]; m
+    mova                 m1, [dstq] ; a
+    mova                 m6, [tmpq] ; b
+    BLEND                m1, m6
+    mova             [dstq], m0
+    add               maskq, 16
+    add                tmpq, 16
+    add                dstq, dsq ; dst_stride
+    dec                  hd
+    jg .w16
+    RET
+.w32:
+    %assign i 0
+    %rep 2
+    mova                 m0, [maskq+16*i]; m
+    mova                 m1, [dstq+16*i] ; a
+    mova                 m6, [tmpq+16*i] ; b
+    BLEND                m1, m6
+    mova        [dstq+i*16], m0
+    %assign i i+1
+    %endrep
+    add               maskq, 32
+    add                tmpq, 32
+    add                dstq, dsq ; dst_stride
+    dec                  hd
+    jg .w32
+    RET
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+    LEA                  r5, blend_v_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    mova                 m5, [base+pw_512]
+    add                  wq, r5
+    add               maskq, obmc_masks-blend_v_ssse3_table
+    jmp                  wq
+.w2:
+    movd                 m3, [maskq+4]
+    punpckldq            m3, m3
+    ; the 2-column mask covers 2 lines of 2 pixels per iteration
+.w2_loop:
+    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
+    pinsrw               m1, [dstq+dsq*1], 1
+    movd                 m2, [tmpq] ; b
+    punpcklbw            m0, m1, m2; {b;a}[7..0]
+    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[7..0] u16
+    pmulhrsw             m0, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+    packuswb             m0, m1    ; {blendpx}[3..0] u8
+    movd                r3d, m0
+    mov        [dstq+dsq*0], r3w
+    shr                 r3d, 16
+    mov        [dstq+dsq*1], r3w
+    add                tmpq, 2*2
+    lea                dstq, [dstq + dsq * 2]
+    sub                  hd, 2
+    jg .w2_loop
+    RET
+.w4:
+    movddup              m3, [maskq+8]
+    ; the 4-column mask covers 2 lines of 4 pixels per iteration
+.w4_loop:
+    movd                 m1, [dstq+dsq*0] ; a
+    movd                 m2, [dstq+dsq*1] ;
+    punpckldq            m1, m2
+    movq                 m2, [tmpq] ; b
+    punpcklbw            m1, m2    ; {b;a}[7..0]
+    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
+    pmulhrsw             m1, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+    packuswb             m1, m1    ; {blendpx}[7..0] u8
+    movd             [dstq], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    add                tmpq, 2*4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_loop
+    RET
+.w8:
+    mova                 m3, [maskq+16]
+    ; the 8-column mask covers 2 lines of 8 pixels per iteration
+.w8_loop:
+    movq                 m1, [dstq+dsq*0] ; a
+    movhps               m1, [dstq+dsq*1]
+    mova                 m2, [tmpq]; b
+    BLEND_64M            m1, m2, m3, m3
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    add                tmpq, 16
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16:
+    ; the 16-column mask is stored as 32 interleaved {64-m, m} bytes
+    mova                  m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
+    mova                  m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
+.w16_loop:
+    mova                 m1, [dstq] ; a
+    mova                 m2, [tmpq] ; b
+    BLEND_64M            m1, m2, m3, m4
+    mova             [dstq], m0
+    add                tmpq, 16
+    add                dstq, dsq
+    dec                  hd
+    jg .w16_loop
+    RET
+.w32:
+%if WIN64
+    mova            [rsp+8], xmm6
+%endif
+    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
+    ; only the left 24 of 32 columns are blended; past 3/4 of the width
+    ; the obmc mask would leave dst unchanged
+.w32_loop:
+    mova                 m1, [dstq+16*0] ; a
+    mova                 m2, [tmpq+16*0] ; b
+    BLEND_64M            m1, m2, m3, m4
+    movq                 m1, [dstq+16*1] ; a
+    punpcklbw            m1, [tmpq+16*1] ; b
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m5
+    packuswb             m1, m1
+    mova        [dstq+16*0], m0
+    movq        [dstq+16*1], m1
+    add                tmpq, 32
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_loop
+%if WIN64
+    mova               xmm6, [rsp+8]
+%endif
+    RET
+
+cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+    ; We need to keep the PIC pointer for w4, reload wd from stack instead
+    DECLARE_REG_TMP 6
+%else
+    DECLARE_REG_TMP 5
+    mov                 r6d, wd
+%endif
+    LEA                  t0, blend_h_ssse3_table
+    tzcnt                wd, wm
+    mov                  hd, hm
+    movsxd               wq, dword [t0+wq*4]
+    mova                 m5, [base+pw_512]
+    add                  wq, t0
+    lea               maskq, [base+obmc_masks+hq*2]
+    lea                  hd, [hq*3]
+    shr                  hd, 2 ; h * 3/4
+    lea               maskq, [maskq+hq*2]
+    neg                  hq
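+    ; blend_h blends only the top 3/4 of the block: maskq was biased past
+    ; the last mask entry and hq counts up from -(h*3/4) to zero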
+    jmp                  wq
+.w2:
+    movd                 m0, [dstq+dsq*0]
+    pinsrw               m0, [dstq+dsq*1], 1
+    movd                 m2, [maskq+hq*2]
+    movd                 m1, [tmpq]
+    punpcklwd            m2, m2
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m2
+    pmulhrsw             m0, m5
+    packuswb             m0, m0
+    movd                r3d, m0
+    mov        [dstq+dsq*0], r3w
+    shr                 r3d, 16
+    mov        [dstq+dsq*1], r3w
+    lea                dstq, [dstq+dsq*2]
+    add                tmpq, 2*2
+    add                  hq, 2
+    jl .w2
+    RET
+.w4:
+%if ARCH_X86_32
+    mova                 m3, [base+blend_shuf]
+%else
+    mova                 m3, [blend_shuf]
+%endif
+.w4_loop:
+    movd                 m0, [dstq+dsq*0]
+    movd                 m2, [dstq+dsq*1]
+    punpckldq            m0, m2 ; a
+    movq                 m1, [tmpq] ; b
+    movq                 m2, [maskq+hq*2] ; m
+    pshufb               m2, m3
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m2
+    pmulhrsw             m0, m5
+    packuswb             m0, m0
+    movd       [dstq+dsq*0], m0
+    psrlq                m0, 32
+    movd       [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    add                tmpq, 4*2
+    add                  hq, 2
+    jl .w4_loop
+    RET
+.w8:
+    movd                 m4, [maskq+hq*2]
+    punpcklwd            m4, m4
+    pshufd               m3, m4, q0000
+    pshufd               m4, m4, q1111
+    movq                 m1, [dstq+dsq*0] ; a
+    movhps               m1, [dstq+dsq*1]
+    mova                 m2, [tmpq]
+    BLEND_64M            m1, m2, m3, m4
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    add                tmpq, 8*2
+    add                  hq, 2
+    jl .w8
+    RET
+; w16/w32/w64/w128
+.w16:
+%if ARCH_X86_32
+    mov                 r6d, wm
+%endif
+    sub                 dsq, r6
+.w16_loop0:
+    movd                 m3, [maskq+hq*2]
+    pshuflw              m3, m3, q0000
+    punpcklqdq           m3, m3
+    mov                  wd, r6d
+.w16_loop:
+    mova                 m1, [dstq] ; a
+    mova                 m2, [tmpq] ; b
+    BLEND_64M            m1, m2, m3, m3
+    mova             [dstq], m0
+    add                dstq, 16
+    add                tmpq, 16
+    sub                  wd, 16
+    jg .w16_loop
+    add                dstq, dsq
+    inc                  hq
+    jl .w16_loop0
+    RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh total filled size
+; iw, ih, copied block -> fill bottom, right
+; x, y, offset in bw/bh -> fill top, left
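+;
+; after clamping, the block is filled in three passes: center rows via
+; v_loop (left-edge splat, body copy, right-edge splat), then bottom_ext
+; rows replicating the last center row, then top_ext rows replicating the
+; first center row (whose pointer was saved in reg_blkm)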
+cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
+                             y, dst, dstride, src, sstride, \
+                             bottomext, rightext, blk
+    ; we assume that the buffer (stride) is larger than width, so we can
+    ; safely overwrite by a few bytes
+    pxor                 m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero       r12q
+ %define reg_tmp        r10
+ %define reg_src        srcq
+ %define reg_bottomext  bottomextq
+ %define reg_rightext   rightextq
+ %define reg_blkm       r9m
+%else
+ %define reg_zero       r6
+ %define reg_tmp        r0
+ %define reg_src        r1
+ %define reg_bottomext  r0
+ %define reg_rightext   r1
+ %define reg_blkm       r2m
+%endif
+    ;
+    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+    xor            reg_zero, reg_zero
+    lea             reg_tmp, [ihq-1]
+    cmp                  yq, ihq
+    cmovs           reg_tmp, yq
+    test                 yq, yq
+    cmovs           reg_tmp, reg_zero
+%if ARCH_X86_64
+    imul            reg_tmp, sstrideq
+    add                srcq, reg_tmp
+%else
+    imul            reg_tmp, sstridem
+    mov             reg_src, srcm
+    add             reg_src, reg_tmp
+%endif
+    ;
+    ; ref += iclip(x, 0, iw - 1)
+    lea             reg_tmp, [iwq-1]
+    cmp                  xq, iwq
+    cmovs           reg_tmp, xq
+    test                 xq, xq
+    cmovs           reg_tmp, reg_zero
+    add             reg_src, reg_tmp
+%if ARCH_X86_32
+    mov                srcm, reg_src
+%endif
+    ;
+    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+    mov                  r1, r1m ; restore bh
+%endif
+    lea       reg_bottomext, [yq+bhq]
+    sub       reg_bottomext, ihq
+    lea                  r3, [bhq-1]
+    cmovs     reg_bottomext, reg_zero
+    ;
+
+    DEFINE_ARGS bw, bh, iw, ih, x, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; top_ext = iclip(-y, 0, bh - 1)
+    neg             topextq
+    cmovs           topextq, reg_zero
+    cmp       reg_bottomext, bhq
+    cmovns    reg_bottomext, r3
+    cmp             topextq, bhq
+    cmovg           topextq, r3
+ %if ARCH_X86_32
+    mov                 r4m, reg_bottomext
+    ;
+    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+    mov                  r0, r0m ; restore bw
+ %endif
+    lea        reg_rightext, [xq+bwq]
+    sub        reg_rightext, iwq
+    lea                  r2, [bwq-1]
+    cmovs      reg_rightext, reg_zero
+
+    DEFINE_ARGS bw, bh, iw, ih, leftext, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; left_ext = iclip(-x, 0, bw - 1)
+    neg            leftextq
+    cmovs          leftextq, reg_zero
+    cmp        reg_rightext, bwq
+    cmovns     reg_rightext, r2
+ %if ARCH_X86_32
+    mov                 r3m, r1
+ %endif
+    cmp            leftextq, bwq
+    cmovns         leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+    lea                  r3, [bottomextq+topextq]
+    sub            centerhq, r3
+%else
+    mov                  r1, centerhm ; restore r1
+    sub            centerhq, topextq
+    sub            centerhq, r4m
+    mov                 r1m, centerhq
+%endif
+    ;
+    ; blk += top_ext * PXSTRIDE(dst_stride)
+    mov                  r2, topextq
+%if ARCH_X86_64
+    imul                 r2, dstrideq
+%else
+    mov                  r6, r6m ; restore dstq
+    imul                 r2, dstridem
+%endif
+    add                dstq, r2
+    mov            reg_blkm, dstq ; save pointer for ext
+    ;
+    ; center_w = bw - left_ext - right_ext
+    mov            centerwq, bwq
+%if ARCH_X86_64
+    lea                  r3, [rightextq+leftextq]
+    sub            centerwq, r3
+%else
+    sub            centerwq, r3m
+    sub            centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+  %if ARCH_X86_64
+    %define reg_tmp        r12
+  %else
+    %define reg_tmp        r0
+  %endif
+.v_loop_%3:
+  %if ARCH_X86_32
+    mov                  r0, r0m
+    mov                  r1, r1m
+  %endif
+%if %1
+    ; left extension
+  %if ARCH_X86_64
+    movd                 m0, [srcq]
+  %else
+    mov                  r3, srcm
+    movd                 m0, [r3]
+  %endif
+    pshufb               m0, m1
+    xor                  r3, r3
+.left_loop_%3:
+    mova          [dstq+r3], m0
+    add                  r3, mmsize
+    cmp                  r3, leftextq
+    jl .left_loop_%3
+    ; body
+    lea             reg_tmp, [dstq+leftextq]
+%endif
+    xor                  r3, r3
+.body_loop_%3:
+  %if ARCH_X86_64
+    movu                 m0, [srcq+r3]
+  %else
+    mov                  r1, srcm
+    movu                 m0, [r1+r3]
+  %endif
+%if %1
+    movu       [reg_tmp+r3], m0
+%else
+    movu          [dstq+r3], m0
+%endif
+    add                  r3, mmsize
+    cmp                  r3, centerwq
+    jl .body_loop_%3
+%if %2
+    ; right extension
+%if %1
+    add             reg_tmp, centerwq
+%else
+    lea             reg_tmp, [dstq+centerwq]
+%endif
+  %if ARCH_X86_64
+    movd                 m0, [srcq+centerwq-1]
+  %else
+    mov                  r3, srcm
+    movd                 m0, [r3+centerwq-1]
+  %endif
+    pshufb               m0, m1
+    xor                  r3, r3
+.right_loop_%3:
+    movu       [reg_tmp+r3], m0
+    add                  r3, mmsize
+  %if ARCH_X86_64
+    cmp                  r3, rightextq
+  %else
+    cmp                  r3, r3m
+  %endif
+    jl .right_loop_%3
+%endif
+  %if ARCH_X86_64
+    add                dstq, dstrideq
+    add                srcq, sstrideq
+    dec            centerhq
+    jg .v_loop_%3
+  %else
+    add                dstq, dstridem
+    mov                  r0, sstridem
+    add                srcm, r0
+    sub       dword centerhm, 1
+    jg .v_loop_%3
+    mov                  r0, r0m ; restore r0
+  %endif
+%endmacro ; v_loop macro
+
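+    ; dispatch to the v_loop variant matching the required extensions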
+    test           leftextq, leftextq
+    jnz .need_left_ext
+ %if ARCH_X86_64
+    test          rightextq, rightextq
+    jnz .need_right_ext
+ %else
+    cmp            leftextq, r3m ; leftextq is 0 here, so this tests rightextq != 0
+    jne .need_right_ext
+ %endif
+    v_loop                0, 0, 0
+    jmp .body_done
+
+    ; left and left+right extension variants
+.need_left_ext:
+ %if ARCH_X86_64
+    test          rightextq, rightextq
+ %else
+    mov                  r3, r3m
+    test                 r3, r3
+ %endif
+    jnz .need_left_right_ext
+    v_loop                1, 0, 1
+    jmp .body_done
+
+.need_left_right_ext:
+    v_loop                1, 1, 2
+    jmp .body_done
+
+.need_right_ext:
+    v_loop                0, 1, 3
+
+.body_done:
+; r0 = bw
+; r1 = x loop counter
+; r4 = y loop counter
+; r5 = topextq
+; r6 = dstq
+; r7 = dstrideq
+; r8 = srcq
+%if ARCH_X86_64
+ %define reg_dstride    dstrideq
+%else
+ %define reg_dstride    r2
+%endif
+    ;
+    ; bottom edge extension
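+    ; (replicates the last written row bottomext times)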
+ %if ARCH_X86_64
+    test         bottomextq, bottomextq
+    jz .top
+ %else
+    xor                  r1, r1
+    cmp                  r1, r4m
+    je .top
+ %endif
+    ;
+ %if ARCH_X86_64
+    mov                srcq, dstq
+    sub                srcq, dstrideq
+    xor                  r1, r1
+ %else
+    mov                  r3, dstq
+    mov         reg_dstride, dstridem
+    sub                  r3, reg_dstride
+    mov                srcm, r3
+ %endif
+    ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, bottomextq
+ %else
+    mov                  r3, srcm
+    mova                 m0, [r3+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, r4m
+ %endif
+    ;
+.bottom_y_loop:
+    mova               [r3], m0
+    add                  r3, reg_dstride
+    dec                  r4
+    jg .bottom_y_loop
+    add                  r1, mmsize
+    cmp                  r1, bwq
+    jl .bottom_x_loop
+
+.top:
+    ; top edge extension
+    test            topextq, topextq
+    jz .end
+%if ARCH_X86_64
+    mov                srcq, reg_blkm
+%else
+    mov                  r3, reg_blkm
+    mov         reg_dstride, dstridem
+%endif
+    mov                dstq, dstm
+    xor                  r1, r1
+    ;
+.top_x_loop:
+%if ARCH_X86_64
+    mova                 m0, [srcq+r1]
+%else
+    mov                  r3, reg_blkm
+    mova                 m0, [r3+r1]
+%endif
+    lea                  r3, [dstq+r1]
+    mov                  r4, topextq
+    ;
+.top_y_loop:
+    mova               [r3], m0
+    add                  r3, reg_dstride
+    dec                  r4
+    jg .top_y_loop
+    add                  r1, mmsize
+    cmp                  r1, bwq
+    jl .top_x_loop
+
+.end:
+    RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+    mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+    SWAP             %1, %2
+%endif
+%endmacro
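+; SCRATCH %1, %2, %3: on x86-32 (only 8 xmm registers) spill m%1 to
+; stack slot %3 and alias m%2 to that memory; on x86-64 just rename
+; the register via SWAP.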
+
+%if ARCH_X86_64
+cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
+                           dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+                                 dst_w, h, src_w, dx, mx0
+%else
+cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+                                 dst_w, h, src_w, dx, mx0
+%endif
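+    ; Horizontal resize: output pixel x samples the source at the 14-bit
+    ; fixed-point position mx0 + x*dx. Bits 8-13 of that position select
+    ; one of 64 8-tap phases from resize_filter, the integer part (>>14)
+    ; is clipped against src_w, and out-of-range taps are remapped with
+    ; pshufb-based edge emulation.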
+    movifnidn          dstq, dstmp
+    movifnidn          srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+    movifnidn        dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+    movifnidn            hd, hm
+%endif
+    sub          dword mx0m, 4<<14
+    sub        dword src_wm, 8
+    movd                 m7, dxm
+    movd                 m6, mx0m
+    movd                 m5, src_wm
+    pshufd               m7, m7, q0000
+    pshufd               m6, m6, q0000
+    pshufd               m5, m5, q0000
+
+%if ARCH_X86_64
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+    LEA                  r7, $$
+%define base r7-$$
+%else
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%if STACK_ALIGNMENT >= 16
+    LEA                  r6, $$
+%define base r6-$$
+%else
+    LEA                  r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+    mova                m12, [base+pw_m256]
+    mova                m11, [base+pd_63]
+    mova                m10, [base+pb_8x0_8x8]
+%else
+%define m12 [base+pw_m256]
+%define m11 [base+pd_63]
+%define m10 [base+pb_8x0_8x8]
+%endif
+    pmaddwd              m4, m7, [base+resize_mul]  ; dx*[0,1,2,3]
+    pslld                m7, 2                      ; dx*4
+    pslld                m5, 14
+    paddd                m6, m4                     ; mx+[0..3]*dx
+    SCRATCH               7, 15, 0
+    SCRATCH               6, 14, 1
+    SCRATCH               5, 13, 2
+
+    ; m10 = pb_8x0_8x8 (0,8), m11 = 0x3f, m12 = pmulhrsw constant for x=(x+64)>>7
+    ; m13 = (src_w - 8) << 14, m14 = mx+[0..3]*dx, m15 = dx*4
+
+.loop_y:
+    xor                  xd, xd
+    mova                 m0, m14                    ; per-line working version of mx
+
+.loop_x:
+    pxor                 m1, m1
+    pcmpgtd              m1, m0
+    pandn                m1, m0
+    psrad                m2, m0, 8                  ; filter offset (unmasked)
+    pcmpgtd              m3, m13, m1
+    pand                 m1, m3
+    pandn                m3, m13
+    por                  m1, m3
+    psubd                m3, m0, m1                 ; pshufb offset
+    psrad                m1, 14                     ; clipped src_x offset
+    psrad                m3, 14                     ; pshufb edge_emu offset
+    pand                 m2, m11                    ; filter offset (masked)
+
+    ; load source pixels
+%if ARCH_X86_64
+    movd                r8d, xm1
+    pshuflw             xm1, xm1, q3232
+    movd                r9d, xm1
+    punpckhqdq          xm1, xm1
+    movd               r10d, xm1
+    psrlq               xm1, 32
+    movd               r11d, xm1
+    movq                xm4, [srcq+r8]
+    movq                xm5, [srcq+r10]
+    movhps              xm4, [srcq+r9]
+    movhps              xm5, [srcq+r11]
+%else
+    movd                r3d, xm1
+    pshufd              xm1, xm1, q3312
+    movd                r1d, xm1
+    pshuflw             xm1, xm1, q3232
+    movq                xm4, [srcq+r3]
+    movq                xm5, [srcq+r1]
+    movd                r3d, xm1
+    punpckhqdq          xm1, xm1
+    movd                r1d, xm1
+    movhps              xm4, [srcq+r3]
+    movhps              xm5, [srcq+r1]
+%endif
+
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    ; this also saves 2 quasi-vpgatherdqs
+    pxor                 m6, m6
+    pcmpeqb              m6, m3
+%if ARCH_X86_64
+    pmovmskb            r8d, m6
+    cmp                 r8d, 0xffff
+%else
+    pmovmskb            r3d, m6
+    cmp                 r3d, 0xffff
+%endif
+    je .filter
+
+%if ARCH_X86_64
+    movd                r8d, xm3
+    pshuflw             xm3, xm3, q3232
+    movd                r9d, xm3
+    punpckhqdq          xm3, xm3
+    movd               r10d, xm3
+    psrlq               xm3, 32
+    movd               r11d, xm3
+    movsxd               r8, r8d
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    movq                xm6, [base+resize_shuf+4+r8]
+    movq                xm7, [base+resize_shuf+4+r10]
+    movhps              xm6, [base+resize_shuf+4+r9]
+    movhps              xm7, [base+resize_shuf+4+r11]
+%else
+    movd                r3d, xm3
+    pshufd              xm3, xm3, q3312
+    movd                r1d, xm3
+    pshuflw             xm3, xm3, q3232
+    movq                xm6, [base+resize_shuf+4+r3]
+    movq                xm7, [base+resize_shuf+4+r1]
+    movd                r3d, xm3
+    punpckhqdq          xm3, xm3
+    movd                r1d, xm3
+    movhps              xm6, [base+resize_shuf+4+r3]
+    movhps              xm7, [base+resize_shuf+4+r1]
+%endif
+
+    paddb                m6, m10
+    paddb                m7, m10
+    pshufb               m4, m6
+    pshufb               m5, m7
+
+.filter:
+%if ARCH_X86_64
+    movd                r8d, xm2
+    pshuflw             xm2, xm2, q3232
+    movd                r9d, xm2
+    punpckhqdq          xm2, xm2
+    movd               r10d, xm2
+    psrlq               xm2, 32
+    movd               r11d, xm2
+    movq                xm6, [base+resize_filter+r8*8]
+    movq                xm7, [base+resize_filter+r10*8]
+    movhps              xm6, [base+resize_filter+r9*8]
+    movhps              xm7, [base+resize_filter+r11*8]
+%else
+    movd                r3d, xm2
+    pshufd              xm2, xm2, q3312
+    movd                r1d, xm2
+    pshuflw             xm2, xm2, q3232
+    movq                xm6, [base+resize_filter+r3*8]
+    movq                xm7, [base+resize_filter+r1*8]
+    movd                r3d, xm2
+    punpckhqdq          xm2, xm2
+    movd                r1d, xm2
+    movhps              xm6, [base+resize_filter+r3*8]
+    movhps              xm7, [base+resize_filter+r1*8]
+%endif
+
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m7
+    phaddw               m4, m5
+    phaddsw              m4, m4
+    pmulhrsw             m4, m12                    ; x=(x+64)>>7
+    packuswb             m4, m4
+    movd          [dstq+xq], m4
+
+    paddd                m0, m15
+    add                  xd, 4
+%if STACK_ALIGNMENT >= 16
+    cmp                  xd, dst_wd
+%else
+    cmp                  xd, dst_wm
+%endif
+    jl .loop_x
+
+%if ARCH_X86_64
+    add                dstq, dst_strideq
+    add                srcq, src_strideq
+    dec                  hd
+%else
+    add                dstq, dst_stridem
+    add                srcq, src_stridem
+    dec           dword r5m
+%endif
+    jg .loop_y
+    RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
diff --git a/src/x86/msac.asm b/src/x86/msac.asm
new file mode 100644 (file)
index 0000000..756e19b
--- /dev/null
@@ -0,0 +1,668 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64 ; avoids cacheline splits
+
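+; probability floor: every remaining symbol reserves a minimum
+; probability of 4 (EC_MIN_PROB in the C code), so entry i holds
+; 4*(15-i); the decoder indexes this table from the end by symbol count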
+min_prob:  dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+pw_0xff00: times 8 dw 0xff00
+pw_32:     times 8 dw 32
+
+%if ARCH_X86_64
+%define resp   resq
+%define movp   movq
+%define c_shuf q3333
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1
+%endmacro
+%else
+%define resp   resd
+%define movp   movd
+%define c_shuf q1111
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
+    mov            t0, r0m
+    mov            t1, r1m
+%if %1 == 0
+    mov            t2, r2m
+%endif
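+; reserve a scratch buffer on the stack; when the incoming alignment is
+; unknown, align esp by hand and stash the original esp at [esp] so
+; .renorm2 can restore it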
+%if STACK_ALIGNMENT >= 16
+    sub           esp, 40-%1*4
+%else
+    mov           eax, esp
+    and           esp, ~15
+    sub           esp, 40-%1*4
+    mov         [esp], eax
+%endif
+%endmacro
+%endif
+
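+; mirrors the start of MsacContext in src/msac.h; the field offsets
+; below must stay in sync with the C struct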
+struc msac
+    .buf:        resp 1
+    .end:        resp 1
+    .dif:        resp 1
+    .rng:        resd 1
+    .cnt:        resd 1
+    .update_cdf: resd 1
+endstruc
+
+%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
+%define buf rsp+stack_offset+8 ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
+%define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
+%define buf esp+8
+%endif
+
+INIT_XMM sse2
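+; Vectorized multi-symbol arithmetic decode. All candidate bounds
+;     v[i] = (((rng >> 8) * (cdf[i] >> 6)) >> 1) + 4*(n_symbols - i)
+; are computed at once (the last term comes from min_prob), and the
+; returned symbol is the first index with v[i] <= c, where c is the top
+; 16 bits of dif; a rough sketch of the C reference in src/msac.c.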
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m2, [t0+msac.rng]
+    movq           m1, [t1]
+    movp           m3, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    not            t2     ; -(n_symbols + 1)
+    pshuflw        m2, m2, q0000
+    movd     [buf+12], m2
+    pand           m2, [rax]
+    mova           m0, m1
+    psrlw          m1, 6
+    psllw          m1, 7
+    pmulhuw        m1, m2
+    movq           m2, [rax+t2*2]
+    pshuflw        m3, m3, c_shuf
+    paddw          m1, m2
+    mova     [buf+16], m1
+    psubusw        m1, m3
+    pxor           m2, m2
+    pcmpeqw        m1, m2 ; c >= v
+    pmovmskb      eax, m1
+    test          t3d, t3d
+    jz .renorm ; !allow_update_cdf
+
+; update_cdf:
+    movzx         t3d, word [t1+t4*2] ; count
+    pcmpeqw        m2, m2
+    mov           t2d, t3d
+    shr           t3d, 4
+    cmp           t4d, 3
+    sbb           t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
+    cmp           t2d, 32
+    adc           t2d, 0  ; count + (count < 32)
+    movd           m3, t3d
+    pavgw          m2, m1 ; i >= val ? -1 : 32768
+    psubw          m2, m0 ; for (i = 0; i < val; i++)
+    psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
+    psraw          m2, m3 ; for (; i < n_symbols; i++)
+    paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
+    movq         [t1], m0
+    mov     [t1+t4*2], t2w
+
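+; renormalization: scale rng back into [1 << 15, 1 << 16) and shift the
+; same number of bits into dif; roughly, in C:
+;     int d = 15 ^ (31 ^ clz(rng));
+;     rng <<= d; dif <<= d; cnt -= d; // refill when the counter runs out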
+.renorm:
+    tzcnt         eax, eax
+    mov            t4, [t0+msac.dif]
+    movzx         t1d, word [buf+rax+16] ; v
+    movzx         t2d, word [buf+rax+14] ; u
+    shr           eax, 1
+.renorm2:
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+    add           esp, 40
+%else
+    mov           esp, [esp]
+%endif
+%endif
+    not            t4
+    sub           t2d, t1d ; rng
+    shl            t1, gprsize*8-16
+    add            t4, t1  ; ~dif
+.renorm3:
+    mov           t1d, [t0+msac.cnt]
+    movifnidn      t7, t0
+.renorm4:
+    bsr           ecx, t2d
+    xor           ecx, 15  ; d
+    shl           t2d, cl
+    shl            t4, cl
+    mov [t7+msac.rng], t2d
+    not            t4
+    sub           t1d, ecx
+    jae .end ; no refill required
+
+; refill:
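+    ; load up to gprsize bytes at once (bswap'd into stream bit order)
+    ; and xor them into the low bits of the inverted dif; .refill_eob
+    ; below falls back to byte-by-byte loads so the read never goes
+    ; past the end of the input buffer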
+    mov            t2, [t7+msac.buf]
+    mov           rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+    push           t5
+%endif
+    lea            t5, [t2+gprsize]
+    cmp            t5, rcx
+    ja .refill_eob
+    mov            t2, [t2]
+    lea           ecx, [t1+23]
+    add           t1d, 16
+    shr           ecx, 3   ; shift_bytes
+    bswap          t2
+    sub            t5, rcx
+    shl           ecx, 3   ; shift_bits
+    shr            t2, cl
+    sub           ecx, t1d ; shift_bits - 16 - cnt
+    mov           t1d, gprsize*8-16
+    shl            t2, cl
+    mov [t7+msac.buf], t5
+    sub           t1d, ecx ; cnt + gprsize*8 - shift_bits
+    xor            t4, t2
+%if ARCH_X86_64 == 0
+    pop            t5
+%endif
+.end:
+    mov [t7+msac.cnt], t1d
+    mov [t7+msac.dif], t4
+    RET
+.refill_eob: ; avoid overreading the input buffer
+    mov            t5, rcx
+    mov           ecx, gprsize*8-24
+    sub           ecx, t1d ; c
+.refill_eob_loop:
+    cmp            t2, t5
+    jae .refill_eob_end    ; eob reached
+    movzx         t1d, byte [t2]
+    inc            t2
+    shl            t1, cl
+    xor            t4, t1
+    sub           ecx, 8
+    jge .refill_eob_loop
+.refill_eob_end:
+    mov           t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+    pop            t5
+%endif
+    sub           t1d, ecx
+    mov [t7+msac.buf], t2
+    mov [t7+msac.dif], t4
+    mov [t7+msac.cnt], t1d
+    RET
+
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m2, [t0+msac.rng]
+    mova           m1, [t1]
+    movp           m3, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    not            t2
+    pshuflw        m2, m2, q0000
+    movd     [buf+12], m2
+    punpcklqdq     m2, m2
+    mova           m0, m1
+    psrlw          m1, 6
+    pand           m2, [rax]
+    psllw          m1, 7
+    pmulhuw        m1, m2
+    movu           m2, [rax+t2*2]
+    pshuflw        m3, m3, c_shuf
+    paddw          m1, m2
+    punpcklqdq     m3, m3
+    mova     [buf+16], m1
+    psubusw        m1, m3
+    pxor           m2, m2
+    pcmpeqw        m1, m2
+    pmovmskb      eax, m1
+    test          t3d, t3d
+    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
+    movzx         t3d, word [t1+t4*2]
+    pcmpeqw        m2, m2
+    mov           t2d, t3d
+    shr           t3d, 4
+    cmp           t4d, 3 ; may be called with n_symbols <= 2
+    sbb           t3d, -5
+    cmp           t2d, 32
+    adc           t2d, 0
+    movd           m3, t3d
+    pavgw          m2, m1
+    psubw          m2, m0
+    psubw          m0, m1
+    psraw          m2, m3
+    paddw          m0, m2
+    mova         [t1], m0
+    mov     [t1+t4*2], t2w
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
+
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+    DECODE_SYMBOL_ADAPT_INIT
+    LEA           rax, pw_0xff00
+    movd           m4, [t0+msac.rng]
+    mova           m2, [t1]
+    mova           m3, [t1+16]
+    movp           m5, [t0+msac.dif]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    not            t2
+%if WIN64
+    sub           rsp, 48 ; need 36 bytes, shadow space is only 32
+%endif
+    pshuflw        m4, m4, q0000
+    movd      [buf-4], m4
+    punpcklqdq     m4, m4
+    mova           m0, m2
+    psrlw          m2, 6
+    mova           m1, m3
+    psrlw          m3, 6
+    pand           m4, [rax]
+    psllw          m2, 7
+    psllw          m3, 7
+    pmulhuw        m2, m4
+    pmulhuw        m3, m4
+    movu           m4, [rax+t2*2]
+    pshuflw        m5, m5, c_shuf
+    paddw          m2, m4
+    psubw          m4, [rax-pw_0xff00+pw_32]
+    punpcklqdq     m5, m5
+    paddw          m3, m4
+    mova        [buf], m2
+    psubusw        m2, m5
+    mova     [buf+16], m3
+    psubusw        m3, m5
+    pxor           m4, m4
+    pcmpeqw        m2, m4
+    pcmpeqw        m3, m4
+    packsswb       m5, m2, m3
+    pmovmskb      eax, m5
+    test          t3d, t3d
+    jz .renorm
+    movzx         t3d, word [t1+t4*2]
+    pcmpeqw        m4, m4
+    mova           m5, m4
+    lea           t2d, [t3+80] ; assumes n_symbols > 2
+    shr           t2d, 4
+    cmp           t3d, 32
+    adc           t3d, 0
+    pavgw          m4, m2
+    pavgw          m5, m3
+    psubw          m4, m0
+    psubw          m0, m2
+    movd           m2, t2d
+    psubw          m5, m1
+    psubw          m1, m3
+    psraw          m4, m2
+    psraw          m5, m2
+    paddw          m0, m4
+    paddw          m1, m5
+    mova         [t1], m0
+    mova      [t1+16], m1
+    mov     [t1+t4*2], t3w
+.renorm:
+    tzcnt         eax, eax
+    mov            t4, [t0+msac.dif]
+    movzx         t1d, word [buf+rax*2]
+    movzx         t2d, word [buf+rax*2-2]
+%if WIN64
+    add           rsp, 48
+%endif
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
+
+cglobal msac_decode_bool_adapt, 0, 6, 0
+    movifnidn      t1, r1mp
+    movifnidn      t0, r0mp
+    movzx         eax, word [t1]
+    movzx         t3d, byte [t0+msac.rng+1]
+    mov            t4, [t0+msac.dif]
+    mov           t2d, [t0+msac.rng]
+%if ARCH_X86_64
+    mov           t5d, eax
+%endif
+    and           eax, ~63
+    imul          eax, t3d
+%if UNIX64
+    mov            t6, t4
+%endif
+    shr           eax, 7
+    add           eax, 4            ; v
+    mov           t3d, eax
+    shl           rax, gprsize*8-16 ; vw
+    sub           t2d, t3d          ; r - v
+    sub            t4, rax          ; dif - vw
+    setb           al
+    cmovb         t2d, t3d
+    mov           t3d, [t0+msac.update_cdf]
+%if UNIX64
+    cmovb          t4, t6
+%else
+    cmovb          t4, [t0+msac.dif]
+%endif
+%if ARCH_X86_64 == 0
+    movzx         eax, al
+%endif
+    not            t4
+    test          t3d, t3d
+    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%if UNIX64 == 0
+    push           t6
+%endif
+    movzx         t6d, word [t1+2]
+%if ARCH_X86_64 == 0
+    push           t5
+    movzx         t5d, word [t1]
+%endif
+    movifnidn      t7, t0
+    lea           ecx, [t6+64]
+    cmp           t6d, 32
+    adc           t6d, 0
+    mov        [t1+2], t6w
+    imul          t6d, eax, -32769
+    shr           ecx, 4   ; rate
+    add           t6d, t5d ; if (bit)
+    sub           t5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+    sar           t6d, cl  ; else
+    sub           t5d, t6d ;     cdf[0] -= cdf[0] >> rate;
+    mov          [t1], t5w
+%if WIN64
+    mov           t1d, [t7+msac.cnt]
+    pop            t6
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
+%else
+%if ARCH_X86_64 == 0
+    pop            t5
+    pop            t6
+%endif
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%endif
+
+cglobal msac_decode_bool_equi, 0, 6, 0
+    movifnidn      t0, r0mp
+    mov           t1d, [t0+msac.rng]
+    mov            t4, [t0+msac.dif]
+    mov           t2d, t1d
+    mov           t1b, 8
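+    ; forcing the low byte to 8 makes the shift below produce
+    ; v = ((rng >> 8) << 7) + 4, an equiprobable split of the range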
+    mov            t3, t4
+    mov           eax, t1d
+    shr           t1d, 1            ; v
+    shl           rax, gprsize*8-17 ; vw
+    sub           t2d, t1d          ; r - v
+    sub            t4, rax          ; dif - vw
+    cmovb         t2d, t1d
+    cmovb          t4, t3
+    setb           al ; the upper 32 bits contain garbage but that's OK
+    not            t4
+%if ARCH_X86_64 == 0
+    movzx         eax, al
+%endif
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+
+cglobal msac_decode_bool, 0, 6, 0
+    movifnidn      t0, r0mp
+    movifnidn     t1d, r1m
+    movzx         eax, byte [t0+msac.rng+1] ; r >> 8
+    mov            t4, [t0+msac.dif]
+    mov           t2d, [t0+msac.rng]
+    and           t1d, ~63
+    imul          eax, t1d
+    mov            t3, t4
+    shr           eax, 7
+    add           eax, 4            ; v
+    mov           t1d, eax
+    shl           rax, gprsize*8-16 ; vw
+    sub           t2d, t1d          ; r - v
+    sub            t4, rax          ; dif - vw
+    cmovb         t2d, t1d
+    cmovb          t4, t3
+    setb           al
+    not            t4
+%if ARCH_X86_64 == 0
+    movzx         eax, al
+%endif
+    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+
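+; Decode the "high token" of a coefficient level: a 3-symbol adaptive
+; CDF is decoded up to four times, each result accumulated into the
+; token, exiting early once the decoded branch symbol is < 3.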
+%macro HI_TOK 1 ; update_cdf
+%if ARCH_X86_64 == 0
+    mov           eax, -24
+%endif
+%%loop:
+%if %1
+    movzx         t2d, word [t1+3*2]
+%endif
+    mova           m1, m0
+    pshuflw        m2, m2, q0000
+    psrlw          m1, 6
+    movd     [buf+12], m2
+    pand           m2, m4
+    psllw          m1, 7
+    pmulhuw        m1, m2
+%if ARCH_X86_64 == 0
+    add           eax, 5
+    mov       [buf+8], eax
+%endif
+    pshuflw        m3, m3, c_shuf
+    paddw          m1, m5
+    movq     [buf+16], m1
+    psubusw        m1, m3
+    pxor           m2, m2
+    pcmpeqw        m1, m2
+    pmovmskb      eax, m1
+%if %1
+    lea           ecx, [t2+80]
+    pcmpeqw        m2, m2
+    shr           ecx, 4
+    cmp           t2d, 32
+    adc           t2d, 0
+    movd           m3, ecx
+    pavgw          m2, m1
+    psubw          m2, m0
+    psubw          m0, m1
+    psraw          m2, m3
+    paddw          m0, m2
+    movq         [t1], m0
+    mov      [t1+3*2], t2w
+%endif
+    tzcnt         eax, eax
+    movzx         ecx, word [buf+rax+16]
+    movzx         t2d, word [buf+rax+14]
+    not            t4
+%if ARCH_X86_64
+    add           t6d, 5
+%endif
+    sub           eax, 5   ; setup for merging the tok_br and tok branches
+    sub           t2d, ecx
+    shl           rcx, gprsize*8-16
+    add            t4, rcx
+    bsr           ecx, t2d
+    xor           ecx, 15
+    shl           t2d, cl
+    shl            t4, cl
+    movd           m2, t2d
+    mov [t7+msac.rng], t2d
+    not            t4
+    sub           t5d, ecx
+    jae %%end
+    mov            t2, [t7+msac.buf]
+    mov           rcx, [t7+msac.end]
+%if UNIX64 == 0
+    push           t8
+%endif
+    lea            t8, [t2+gprsize]
+    cmp            t8, rcx
+    ja %%refill_eob
+    mov            t2, [t2]
+    lea           ecx, [t5+23]
+    add           t5d, 16
+    shr           ecx, 3
+    bswap          t2
+    sub            t8, rcx
+    shl           ecx, 3
+    shr            t2, cl
+    sub           ecx, t5d
+    mov           t5d, gprsize*8-16
+    shl            t2, cl
+    mov [t7+msac.buf], t8
+%if UNIX64 == 0
+    pop            t8
+%endif
+    sub           t5d, ecx
+    xor            t4, t2
+%%end:
+    movp           m3, t4
+%if ARCH_X86_64
+    add           t6d, eax ; CF = tok_br < 3 || tok == 15
+    jnc %%loop
+    lea           eax, [t6+30]
+%else
+    add           eax, [buf+8]
+    jnc %%loop
+    add           eax, 30
+%if STACK_ALIGNMENT >= 16
+    add           esp, 36
+%else
+    mov           esp, [esp]
+%endif
+%endif
+    mov [t7+msac.dif], t4
+    shr           eax, 1
+    mov [t7+msac.cnt], t5d
+    RET
+%%refill_eob:
+    mov            t8, rcx
+    mov           ecx, gprsize*8-24
+    sub           ecx, t5d
+%%refill_eob_loop:
+    cmp            t2, t8
+    jae %%refill_eob_end
+    movzx         t5d, byte [t2]
+    inc            t2
+    shl            t5, cl
+    xor            t4, t5
+    sub           ecx, 8
+    jge %%refill_eob_loop
+%%refill_eob_end:
+%if UNIX64 == 0
+    pop            t8
+%endif
+    mov           t5d, gprsize*8-24
+    mov [t7+msac.buf], t2
+    sub           t5d, ecx
+    jmp %%end
+%endmacro
+
+cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
+    DECODE_SYMBOL_ADAPT_INIT 1
+%if ARCH_X86_64 == 0 && PIC
+    LEA            t2, min_prob+12*2
+    %define base t2-(min_prob+12*2)
+%else
+    %define base 0
+%endif
+    movq           m0, [t1]
+    movd           m2, [t0+msac.rng]
+    mov           eax, [t0+msac.update_cdf]
+    movq           m4, [base+pw_0xff00]
+    movp           m3, [t0+msac.dif]
+    movq           m5, [base+min_prob+12*2]
+    mov            t4, [t0+msac.dif]
+    mov           t5d, [t0+msac.cnt]
+%if ARCH_X86_64
+    mov           t6d, -24
+%endif
+    movifnidn      t7, t0
+    test          eax, eax
+    jz .no_update_cdf
+    HI_TOK          1
+.no_update_cdf:
+    HI_TOK          0
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal msac_decode_symbol_adapt16, 3, 6, 6
+    lea           rax, [pw_0xff00]
+    vpbroadcastw   m2, [t0+msac.rng]
+    mova           m0, [t1]
+    vpbroadcastw   m3, [t0+msac.dif+6]
+    vbroadcasti128 m4, [rax]
+    mov           t3d, [t0+msac.update_cdf]
+    mov           t4d, t2d
+    not            t2
+%if STACK_ALIGNMENT < 32
+    mov            r5, rsp
+%if WIN64
+    and           rsp, ~31
+    sub           rsp, 40
+%else
+    and            r5, ~31
+    %define buf r5-32
+%endif
+%elif WIN64
+    sub           rsp, 64
+%else
+    %define buf rsp-56
+%endif
+    psrlw          m1, m0, 6
+    movd      [buf-4], xm2
+    pand           m2, m4
+    psllw          m1, 7
+    pmulhuw        m1, m2
+    paddw          m1, [rax+t2*2]
+    mova        [buf], m1
+    pmaxuw         m1, m3
+    pcmpeqw        m1, m3
+    pmovmskb      eax, m1
+    test          t3d, t3d
+    jz .renorm
+    movzx         t3d, word [t1+t4*2]
+    pcmpeqw        m2, m2
+    lea           t2d, [t3+80]
+    shr           t2d, 4
+    cmp           t3d, 32
+    adc           t3d, 0
+    movd          xm3, t2d
+    pavgw          m2, m1
+    psubw          m2, m0
+    psubw          m0, m1
+    psraw          m2, xm3
+    paddw          m0, m2
+    mova         [t1], m0
+    mov     [t1+t4*2], t3w
+.renorm:
+    tzcnt         eax, eax
+    mov            t4, [t0+msac.dif]
+    movzx         t1d, word [buf+rax-0]
+    movzx         t2d, word [buf+rax-2]
+    shr           eax, 1
+%if WIN64
+%if STACK_ALIGNMENT < 32
+    mov           rsp, r5
+%else
+    add           rsp, 64
+%endif
+%endif
+    vzeroupper
+    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
+%endif
diff --git a/src/x86/msac.h b/src/x86/msac.h
new file mode 100644 (file)
index 0000000..e11cd08
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
+
+/* Needed for checkasm */
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
+#endif
+
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_sse2
+
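+/* On x86-64, symbol_adapt16 is reached through a function pointer in
+ * MsacContext so that the AVX2 version can be selected at run time
+ * (see msac_init.c). */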
+#if ARCH_X86_64
+#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif
+
+void dav1d_msac_init_x86(MsacContext *const s);
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
diff --git a/src/x86/msac_init.c b/src/x86/msac_init.c
new file mode 100644 (file)
index 0000000..a634da2
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/msac.h"
+#include "src/x86/msac.h"
+
+#if ARCH_X86_64
+void dav1d_msac_init_x86(MsacContext *const s) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+    }
+
+    if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+    }
+}
+#endif
diff --git a/tests/checkasm/arm/checkasm_32.S b/tests/checkasm/arm/checkasm_32.S
new file mode 100644 (file)
index 0000000..a186ef8
--- /dev/null
@@ -0,0 +1,201 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#define PRIVATE_PREFIX checkasm_
+
+#include "src/arm/asm.S"
+#include "src/arm/32/util.S"
+
+const register_init, align=3
+        .quad 0x21f86d66c8ca00ce
+        .quad 0x75b6ba21077c48ad
+        .quad 0xed56bb2dcb3c7736
+        .quad 0x8bda43d3fd1a7e06
+        .quad 0xb64a9c9e5d318408
+        .quad 0xdf9a54b303f1d3a3
+        .quad 0x4a75479abd64e097
+        .quad 0x249214109d5d1c88
+endconst
+
+const error_message_fpscr
+        .asciz "failed to preserve register FPSCR, changed bits: %x"
+error_message_gpr:
+        .asciz "failed to preserve register r%d"
+error_message_vfp:
+        .asciz "failed to preserve register d%d"
+error_message_stack:
+        .asciz "failed to preserve stack"
+endconst
+
+@ max number of args used by any asm function.
+#define MAX_ARGS 15
+
+#define ARG_STACK 4*(MAX_ARGS - 4)
+
+@ Align the used stack space to 8 to preserve the stack alignment.
+@ +8 for stack canary reference.
+#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8)
+
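+@ checked_call_* seeds the callee-saved registers with known garbage,
+@ calls the function under test, then verifies that every callee-saved
+@ register (and a stack canary) survived the call, reporting any
+@ clobber through checkasm_fail_func.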
+.macro clobbercheck variant
+.equ pushed, 4*9
+function checked_call_\variant, export=1
+        push            {r4-r11, lr}
+.ifc \variant, vfp
+        vpush           {d8-d15}
+        fmrx            r4,  FPSCR
+        push            {r4}
+.equ pushed, pushed + 16*4 + 4
+.endif
+
+        movrel          r12, register_init
+.ifc \variant, vfp
+        vldm            r12, {d8-d15}
+.endif
+        ldm             r12, {r4-r11}
+
+        sub             sp,  sp,  #ARG_STACK_A
+.equ pos, 0
+.rept MAX_ARGS-4
+        ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
+        str             r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+
+        @ For stack overflows, the callee is free to overwrite the parameters
+        @ that were passed on the stack (if any), so we can only check after
+        @ that point. First figure out how many parameters the function
+        @ really took on the stack:
+        ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
+        @ Load the first non-parameter value from the stack, which should be
+        @ left untouched by the function. Store a copy of it inverted, so that
+        @ e.g. overwriting everything with zero would be noticed.
+        ldr             r12, [sp, r12, lsl #2]
+        mvn             r12, r12
+        str             r12, [sp, #ARG_STACK_A - 4]
+
+        mov             r12, r0
+        mov             r0,  r2
+        mov             r1,  r3
+        ldrd            r2,  r3,  [sp, #ARG_STACK_A + pushed]
+        @ Call the target function
+        blx             r12
+
+        @ Load the number of stack parameters, stack canary and its reference
+        ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
+        ldr             r2,  [sp, r12, lsl #2]
+        ldr             r3,  [sp, #ARG_STACK_A - 4]
+
+        add             sp,  sp,  #ARG_STACK_A
+        push            {r0, r1}
+
+        mvn             r3,  r3
+        cmp             r2,  r3
+        bne             5f
+
+        movrel          r12, register_init
+.ifc \variant, vfp
+.macro check_reg_vfp, dreg, offset
+        ldrd            r2,  r3,  [r12, #8 * (\offset)]
+        vmov            r0,  lr,  \dreg
+        eor             r2,  r2,  r0
+        eor             r3,  r3,  lr
+        orrs            r2,  r2,  r3
+        bne             4f
+.endm
+
+.irp n, 8, 9, 10, 11, 12, 13, 14, 15
+        @ keep track of the checked double/SIMD register
+        mov             r1,  #\n
+        check_reg_vfp   d\n, \n-8
+.endr
+.purgem check_reg_vfp
+
+        fmrx            r1,  FPSCR
+        ldr             r3,  [sp, #8]
+        eor             r1,  r1,  r3
+        @ Ignore changes in bits 0-4 and 7
+        bic             r1,  r1,  #0x9f
+        @ Ignore changes in the topmost 5 bits
+        bics            r1,  r1,  #0xf8000000
+        bne             3f
+.endif
+
+        @ keep track of the checked GPR
+        mov             r1,  #4
+.macro check_reg reg1, reg2=
+        ldrd            r2,  r3,  [r12], #8
+        eors            r2,  r2,  \reg1
+        bne             2f
+        add             r1,  r1,  #1
+.ifnb \reg2
+        eors            r3,  r3,  \reg2
+        bne             2f
+.endif
+        add             r1,  r1,  #1
+.endm
+        check_reg       r4,  r5
+        check_reg       r6,  r7
+@ r9 is a volatile register in the iOS ABI
+#ifdef __APPLE__
+        check_reg       r8
+#else
+        check_reg       r8,  r9
+#endif
+        check_reg       r10, r11
+.purgem check_reg
+
+        b               0f
+5:
+        movrel          r0, error_message_stack
+        b               1f
+4:
+        movrel          r0, error_message_vfp
+        b               1f
+3:
+        movrel          r0, error_message_fpscr
+        b               1f
+2:
+        movrel          r0, error_message_gpr
+1:
+#ifdef PREFIX
+        bl              _checkasm_fail_func
+#else
+        bl              checkasm_fail_func
+#endif
+0:
+        pop             {r0, r1}
+.ifc \variant, vfp
+        pop             {r2}
+        fmxr            FPSCR, r2
+        vpop            {d8-d15}
+.endif
+        pop             {r4-r11, pc}
+endfunc
+.endm
+
+clobbercheck vfp
diff --git a/tests/checkasm/arm/checkasm_64.S b/tests/checkasm/arm/checkasm_64.S
new file mode 100644 (file)
index 0000000..2574914
--- /dev/null
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#define PRIVATE_PREFIX checkasm_
+
+#include "src/arm/asm.S"
+#include "src/arm/64/util.S"
+
+const register_init, align=4
+        .quad 0x21f86d66c8ca00ce
+        .quad 0x75b6ba21077c48ad
+        .quad 0xed56bb2dcb3c7736
+        .quad 0x8bda43d3fd1a7e06
+        .quad 0xb64a9c9e5d318408
+        .quad 0xdf9a54b303f1d3a3
+        .quad 0x4a75479abd64e097
+        .quad 0x249214109d5d1c88
+        .quad 0x1a1b2550a612b48c
+        .quad 0x79445c159ce79064
+        .quad 0x2eed899d5a28ddcd
+        .quad 0x86b2536fcd8cf636
+        .quad 0xb0856806085e7943
+        .quad 0x3f2bf84fc0fcca4e
+        .quad 0xacbd382dcf5b8de2
+        .quad 0xd229e1f5b281303f
+        .quad 0x71aeaff20b095fd9
+        .quad 0xab63e2e11fa38ed9
+endconst
+
+
+const error_message_register
+        .asciz "failed to preserve register"
+error_message_stack:
+        .asciz "stack clobbered"
+endconst
+
+
+// max number of args used by any asm function.
+#define MAX_ARGS 15
+
+#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
+
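+// stack_clobber fills the area just below the stack pointer with a
+// repeating two-register pattern so that a function reading
+// uninitialized stack is likely to produce a detectable failure.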
+function stack_clobber, export=1
+        mov             x3,  sp
+        mov             x2,  #CLOBBER_STACK
+1:
+        stp             x0,  x1,  [sp, #-16]!
+        subs            x2,  x2,  #16
+        b.gt            1b
+        mov             sp,  x3
+        ret
+endfunc
+
+// + 16 for stack canary reference
+#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16)
+
+function checked_call, export=1
+        stp             x29, x30, [sp, #-16]!
+        mov             x29, sp
+        stp             x19, x20, [sp, #-16]!
+        stp             x21, x22, [sp, #-16]!
+        stp             x23, x24, [sp, #-16]!
+        stp             x25, x26, [sp, #-16]!
+        stp             x27, x28, [sp, #-16]!
+        stp             d8,  d9,  [sp, #-16]!
+        stp             d10, d11, [sp, #-16]!
+        stp             d12, d13, [sp, #-16]!
+        stp             d14, d15, [sp, #-16]!
+
+        movrel          x9, register_init
+        ldp             d8,  d9,  [x9], #16
+        ldp             d10, d11, [x9], #16
+        ldp             d12, d13, [x9], #16
+        ldp             d14, d15, [x9], #16
+        ldp             x19, x20, [x9], #16
+        ldp             x21, x22, [x9], #16
+        ldp             x23, x24, [x9], #16
+        ldp             x25, x26, [x9], #16
+        ldp             x27, x28, [x9], #16
+
+        sub             sp,  sp,  #ARG_STACK
+.equ pos, 0
+.rept MAX_ARGS-8
+        // Skip the first 8 args, that are loaded into registers
+        ldr             x9, [x29, #16 + 8*8 + pos]
+        str             x9, [sp, #pos]
+.equ pos, pos + 8
+.endr
+
+        // Fill x8-x17 with garbage. These registers don't have to be
+        // preserved, but this avoids relying on them having any particular value.
+        movrel          x9, register_init
+        ldp             x10, x11, [x9], #32
+        ldp             x12, x13, [x9], #32
+        ldp             x14, x15, [x9], #32
+        ldp             x16, x17, [x9], #32
+        ldp             x8,  x9,  [x9]
+
+        // For stack overflows, the callee is free to overwrite the parameters
+        // that were passed on the stack (if any), so we can only check after
+        // that point. First figure out how many parameters the function
+        // really took on the stack:
+        ldr             w2,  [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
+        // Load the first non-parameter value from the stack, which should be
+        // left untouched by the function. Store a copy of it inverted, so that
+        // e.g. overwriting everything with zero would be noticed.
+        ldr             x2,  [sp, x2, lsl #3]
+        mvn             x2,  x2
+        str             x2,  [sp, #ARG_STACK-8]
+
+        // Load the in-register arguments
+        mov             x12, x0
+        ldp             x0,  x1,  [x29, #16]
+        ldp             x2,  x3,  [x29, #32]
+        ldp             x4,  x5,  [x29, #48]
+        ldp             x6,  x7,  [x29, #64]
+        // Call the target function
+        blr             x12
+
+        // Load the number of stack parameters, stack canary and its reference
+        ldr             w2,  [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
+        ldr             x2,  [sp, x2, lsl #3]
+        ldr             x3,  [sp, #ARG_STACK-8]
+
+        add             sp,  sp,  #ARG_STACK
+        stp             x0,  x1,  [sp, #-16]!
+
+        mvn             x3,  x3
+        cmp             x2,  x3
+        b.ne            2f
+
+        movrel          x9, register_init
+        movi            v3.8h,  #0
+
+.macro check_reg_neon reg1, reg2
+        ldr             q1,  [x9], #16
+        uzp1            v2.2d,  v\reg1\().2d, v\reg2\().2d
+        eor             v1.16b, v1.16b, v2.16b
+        orr             v3.16b, v3.16b, v1.16b
+.endm
+        check_reg_neon  8,  9
+        check_reg_neon  10, 11
+        check_reg_neon  12, 13
+        check_reg_neon  14, 15
+        uqxtn           v3.8b,  v3.8h
+        umov            x3,  v3.d[0]
+
+.macro check_reg reg1, reg2
+        ldp             x0,  x1,  [x9], #16
+        eor             x0,  x0,  \reg1
+        eor             x1,  x1,  \reg2
+        orr             x3,  x3,  x0
+        orr             x3,  x3,  x1
+.endm
+        check_reg       x19, x20
+        check_reg       x21, x22
+        check_reg       x23, x24
+        check_reg       x25, x26
+        check_reg       x27, x28
+
+        cbz             x3,  0f
+
+        movrel          x0, error_message_register
+        b               1f
+2:
+        movrel          x0, error_message_stack
+1:
+#ifdef PREFIX
+        bl              _checkasm_fail_func
+#else
+        bl              checkasm_fail_func
+#endif
+0:
+        ldp             x0,  x1,  [sp], #16
+        ldp             d14, d15, [sp], #16
+        ldp             d12, d13, [sp], #16
+        ldp             d10, d11, [sp], #16
+        ldp             d8,  d9,  [sp], #16
+        ldp             x27, x28, [sp], #16
+        ldp             x25, x26, [sp], #16
+        ldp             x23, x24, [sp], #16
+        ldp             x21, x22, [sp], #16
+        ldp             x19, x20, [sp], #16
+        ldp             x29, x30, [sp], #16
+        ret
+endfunc
diff --git a/tests/checkasm/cdef.c b/tests/checkasm/cdef.c
new file mode 100644 (file)
index 0000000..7259e1a
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/dump.h"
+
+#include "src/levels.h"
+#include "src/cdef.h"
+
+static int to_binary(int x) { /* 0-15 -> 0000-1111 */
+    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
+}
+
+static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
+    const int fill_type = rnd() & 7;
+    if (fill_type == 0)
+        while (n--) /* check for cdef_filter underflows */
+            *buf++ = rnd() & 1;
+    else if (fill_type == 1)
+        while (n--) /* check for cdef_filter overflows */
+            *buf++ = bitdepth_max - (rnd() & 1);
+    else
+        while (n--)
+            *buf++ = rnd() & bitdepth_max;
+}
+
+static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
+    ALIGN_STK_64(pixel, c_src,   16 * 10 + 16, ), *const c_dst = c_src + 8;
+    ALIGN_STK_64(pixel, a_src,   16 * 10 + 16, ), *const a_dst = a_src + 8;
+    ALIGN_STK_64(pixel, top_buf, 16 *  2 + 16, ), *const top = top_buf + 8;
+    ALIGN_STK_16(pixel, left, 8,[2]);
+    const ptrdiff_t stride = 16 * sizeof(pixel);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
+                 const pixel *top, int pri_strength, int sec_strength,
+                 int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+    if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
+        for (int dir = 0; dir < 8; dir++) {
+            for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+                init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
+                init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
+                init_tmp((pixel *) left, 8 * 2, bitdepth_max);
+                memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));
+
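+                /* derive random but plausible strengths the way the
+                 * decoder unpacks a cdef level: pri = lvl >> 2 and
+                 * sec = lvl & 3 (with 3 promoted to 4), both scaled
+                 * to the bitdepth */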
+                const int lvl = 1 + (rnd() % 62);
+                const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
+                int pri_strength = (lvl >> 2) << bitdepth_min_8;
+                int sec_strength = lvl & 3;
+                sec_strength += sec_strength == 3;
+                sec_strength <<= bitdepth_min_8;
+                call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
+                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, stride, left, top, pri_strength, sec_strength,
+                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
+                if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
+                    fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
+                            pri_strength, sec_strength, dir, damping, to_binary(edges));
+                    return;
+                }
+                if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
+                    /* Benchmark a fixed set of cases to get consistent results:
+                     *  1) top/left edges and pri_strength only
+                     *  2) bottom/right edges and sec_strength only
+                     *  3) all edges and both pri_strength and sec_strength
+                     */
+                    pri_strength = (edges & 1) << bitdepth_min_8;
+                    sec_strength = (edges & 2) << bitdepth_min_8;
+                    bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
+                              dir, damping, edges HIGHBD_TAIL_SUFFIX);
+                }
+            }
+        }
+    }
+}
+
+static void check_cdef_direction(const cdef_dir_fn fn) {
+    ALIGN_STK_64(pixel, src, 8 * 8,);
+
+    declare_func(int, pixel *src, ptrdiff_t src_stride, unsigned *var
+                 HIGHBD_DECL_SUFFIX);
+
+    if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) {
+        unsigned c_var, a_var;
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+        const int bitdepth_max = 0xff;
+#endif
+        init_tmp(src, 64, bitdepth_max);
+
+        const int c_dir = call_ref(src, 8 * sizeof(pixel), &c_var HIGHBD_TAIL_SUFFIX);
+        const int a_dir = call_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
+        if (c_var != a_var || c_dir != a_dir) {
+            if (fail()) {
+                hex_fdump(stderr, src, 8 * sizeof(pixel), 8, 8, "src");
+                fprintf(stderr, "c_dir %d a_dir %d\n", c_dir, a_dir);
+            }
+        }
+        bench_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
+    }
+    report("cdef_dir");
+}
+
+void bitfn(checkasm_check_cdef)(void) {
+    Dav1dCdefDSPContext c;
+    bitfn(dav1d_cdef_dsp_init)(&c);
+
+    check_cdef_direction(c.dir);
+
+    check_cdef_filter(c.fb[0], 8, 8);
+    check_cdef_filter(c.fb[1], 4, 8);
+    check_cdef_filter(c.fb[2], 4, 4);
+    report("cdef_filter");
+}
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
new file mode 100644 (file)
index 0000000..ee52c89
--- /dev/null
@@ -0,0 +1,797 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "tests/checkasm/checkasm.h"
+
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "src/cpu.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define COLOR_RED    FOREGROUND_RED
+#define COLOR_GREEN  FOREGROUND_GREEN
+#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN)
+
+static unsigned get_seed(void) {
+    return GetTickCount();
+}
+#else
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#endif
+#define COLOR_RED    1
+#define COLOR_GREEN  2
+#define COLOR_YELLOW 3
+
+static unsigned get_seed(void) {
+#ifdef __APPLE__
+    return (unsigned) mach_absolute_time();
+#elif defined(HAVE_CLOCK_GETTIME)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
+#else
+    /* Fallback so every configuration returns a value; second resolution
+     * is coarse, but it only affects seed diversity, not correctness. */
+    return (unsigned) time(NULL);
+#endif
+}
+#endif
+
+/* List of tests to invoke */
+static const struct {
+    const char *name;
+    void (*func)(void);
+} tests[] = {
+    { "msac", checkasm_check_msac },
+#if CONFIG_8BPC
+    { "cdef_8bpc", checkasm_check_cdef_8bpc },
+    { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
+    { "ipred_8bpc", checkasm_check_ipred_8bpc },
+    { "itx_8bpc", checkasm_check_itx_8bpc },
+    { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
+    { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
+    { "mc_8bpc", checkasm_check_mc_8bpc },
+#endif
+#if CONFIG_16BPC
+    { "cdef_16bpc", checkasm_check_cdef_16bpc },
+    { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
+    { "ipred_16bpc", checkasm_check_ipred_16bpc },
+    { "itx_16bpc", checkasm_check_itx_16bpc },
+    { "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
+    { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc },
+    { "mc_16bpc", checkasm_check_mc_16bpc },
+#endif
+    { 0 }
+};
+
+/* List of cpu flags to check */
+static const struct {
+    const char *name;
+    const char *suffix;
+    unsigned flag;
+} cpus[] = {
+#if ARCH_X86
+    { "SSE2",               "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
+    { "SSSE3",              "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
+    { "SSE4.1",             "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
+    { "AVX2",               "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
+    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
+#elif ARCH_AARCH64 || ARCH_ARM
+    { "NEON",               "neon",      DAV1D_ARM_CPU_FLAG_NEON },
+#elif ARCH_PPC64LE
+    { "VSX",                "vsx",       DAV1D_PPC_CPU_FLAG_VSX },
+#endif
+    { 0 }
+};
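+
+/* Flags are checked cumulatively: check_cpu_flag() ORs each entry into the
+ * previously enabled set, so e.g. the AVX2 pass also runs with all of the
+ * SSE flags enabled. */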
+
+typedef struct CheckasmFuncVersion {
+    struct CheckasmFuncVersion *next;
+    void *func;
+    int ok;
+    unsigned cpu;
+    int iterations;
+    uint64_t cycles;
+} CheckasmFuncVersion;
+
+/* Binary search tree node */
+typedef struct CheckasmFunc {
+    struct CheckasmFunc *child[2];
+    CheckasmFuncVersion versions;
+    uint8_t color; /* 0 = red, 1 = black */
+    char name[];
+} CheckasmFunc;
+
+/* Internal state */
+static struct {
+    CheckasmFunc *funcs;
+    CheckasmFunc *current_func;
+    CheckasmFuncVersion *current_func_ver;
+    const char *current_test_name;
+    const char *bench_pattern;
+    size_t bench_pattern_len;
+    int num_checked;
+    int num_failed;
+    int nop_time;
+    unsigned cpu_flag;
+    const char *cpu_flag_name;
+    const char *test_name;
+    unsigned seed;
+    int bench_c;
+    int verbose;
+    int function_listing;
+#if ARCH_X86_64
+    void (*simd_warmup)(void);
+#endif
+} state;
+
+/* float compare support code */
+typedef union {
+    float f;
+    uint32_t i;
+} intfloat;
+
+static uint32_t xs_state[4];
+
+static void xor128_srand(unsigned seed) {
+    xs_state[0] = seed;
+    xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
+    xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
+    xs_state[3] = ~seed;
+}
+
+// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
+//             Journal of Statistical Software. 8 (14).
+//             doi:10.18637/jss.v008.i14.
+int xor128_rand(void) {
+    const uint32_t x = xs_state[0];
+    const uint32_t t = x ^ (x << 11);
+
+    xs_state[0] = xs_state[1];
+    xs_state[1] = xs_state[2];
+    xs_state[2] = xs_state[3];
+    uint32_t w = xs_state[3];
+
+    w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
+    xs_state[3] = w;
+
+    return w >> 1;
+}
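+
+/* The generator has a period of 2^128 - 1 and, via the rnd() alias in
+ * checkasm.h, always yields non-negative 31-bit values (the sign bit is
+ * shifted out above). */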
+
+static int is_negative(const intfloat u) {
+    return u.i >> 31;
+}
+
+int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
+    intfloat x, y;
+
+    x.f = a;
+    y.f = b;
+
+    if (is_negative(x) != is_negative(y)) {
+        // handle -0.0 == +0.0
+        return a == b;
+    }
+
+    if (llabs((int64_t)x.i - y.i) <= max_ulp)
+        return 1;
+
+    return 0;
+}
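+
+/* This works because finite IEEE-754 floats of the same sign are ordered
+ * the same way as their bit patterns when reinterpreted as integers, so the
+ * integer distance between two values equals their distance in ULPs. */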
+
+int float_near_ulp_array(const float *const a, const float *const b,
+                         const unsigned max_ulp, const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_ulp(a[i], b[i], max_ulp))
+            return 0;
+
+    return 1;
+}
+
+int float_near_abs_eps(const float a, const float b, const float eps) {
+    return fabsf(a - b) < eps;
+}
+
+int float_near_abs_eps_array(const float *const a, const float *const b,
+                             const float eps, const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_abs_eps(a[i], b[i], eps))
+            return 0;
+
+    return 1;
+}
+
+int float_near_abs_eps_ulp(const float a, const float b, const float eps,
+                           const unsigned max_ulp)
+{
+    return float_near_ulp(a, b, max_ulp) || float_near_abs_eps(a, b, eps);
+}
+
+int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
+                                 const float eps, const unsigned max_ulp,
+                                 const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp))
+            return 0;
+
+    return 1;
+}
+
+/* Print colored text to stderr if the terminal supports it */
+static void color_printf(const int color, const char *const fmt, ...) {
+    static int8_t use_color = -1;
+    va_list arg;
+
+#ifdef _WIN32
+    static HANDLE con;
+    static WORD org_attributes;
+
+    if (use_color < 0) {
+        CONSOLE_SCREEN_BUFFER_INFO con_info;
+        con = GetStdHandle(STD_ERROR_HANDLE);
+        if (con && con != INVALID_HANDLE_VALUE &&
+            GetConsoleScreenBufferInfo(con, &con_info))
+        {
+            org_attributes = con_info.wAttributes;
+            use_color = 1;
+        } else
+            use_color = 0;
+    }
+    if (use_color)
+        SetConsoleTextAttribute(con, (org_attributes & 0xfff0) |
+                                (color & 0x0f));
+#else
+    if (use_color < 0) {
+        const char *const term = getenv("TERM");
+        use_color = term && strcmp(term, "dumb") && isatty(2);
+    }
+    if (use_color)
+        fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07);
+#endif
+
+    va_start(arg, fmt);
+    vfprintf(stderr, fmt, arg);
+    va_end(arg);
+
+    if (use_color) {
+#ifdef _WIN32
+        SetConsoleTextAttribute(con, org_attributes);
+#else
+        fprintf(stderr, "\x1b[0m");
+#endif
+    }
+}
+
+/* Deallocate a tree */
+static void destroy_func_tree(CheckasmFunc *const f) {
+    if (f) {
+        CheckasmFuncVersion *v = f->versions.next;
+        while (v) {
+            CheckasmFuncVersion *next = v->next;
+            free(v);
+            v = next;
+        }
+
+        destroy_func_tree(f->child[0]);
+        destroy_func_tree(f->child[1]);
+        free(f);
+    }
+}
+
+/* Allocate a zero-initialized block, clean up and exit on failure */
+static void *checkasm_malloc(const size_t size) {
+    void *const ptr = calloc(1, size);
+    if (!ptr) {
+        fprintf(stderr, "checkasm: malloc failed\n");
+        destroy_func_tree(state.funcs);
+        exit(1);
+    }
+    return ptr;
+}
+
+/* Get the suffix of the highest set cpu flag, or "c" if none is set */
+static const char *cpu_suffix(const unsigned cpu) {
+    /* -2 skips the null terminator at the end of the cpus table */
+    for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)
+        if (cpu & cpus[i].flag)
+            return cpus[i].suffix;
+
+    return "c";
+}
+
+#ifdef readtime
+static int cmp_nop(const void *a, const void *b) {
+    return *(const uint16_t*)a - *(const uint16_t*)b;
+}
+
+/* Measure the overhead of the timing code (in decicycles) */
+static int measure_nop_time(void) {
+    uint16_t nops[10000];
+    int nop_sum = 0;
+
+    for (int i = 0; i < 10000; i++) {
+        uint64_t t = readtime();
+        nops[i] = (uint16_t) (readtime() - t);
+    }
+
+    qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
+    /* Trimmed mean: sum the middle 5000 samples to reject outliers; dividing
+     * by 500 then yields 10x the average, i.e. decicycles. */
+    for (int i = 2500; i < 7500; i++)
+        nop_sum += nops[i];
+
+    return nop_sum / 500;
+}
+
+/* Print benchmark results */
+static void print_benchs(const CheckasmFunc *const f) {
+    if (f) {
+        print_benchs(f->child[0]);
+
+        /* Only print functions with at least one assembly version */
+        if (state.bench_c || f->versions.cpu || f->versions.next) {
+            const CheckasmFuncVersion *v = &f->versions;
+            do {
+                if (v->iterations) {
+                    const int decicycles = (int) (10*v->cycles/v->iterations -
+                                                  state.nop_time) / 4;
+                    printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
+                           decicycles/10, decicycles%10);
+                }
+            } while ((v = v->next));
+        }
+
+        print_benchs(f->child[1]);
+    }
+}
+#endif
+
+static void print_functions(const CheckasmFunc *const f) {
+    if (f) {
+        print_functions(f->child[0]);
+        printf("%s\n", f->name);
+        print_functions(f->child[1]);
+    }
+}
+
+#define is_digit(x) ((x) >= '0' && (x) <= '9')
+
+/* ASCIIbetical sort except preserving natural order for numbers */
+static int cmp_func_names(const char *a, const char *b) {
+    const char *const start = a;
+    int ascii_diff, digit_diff;
+
+    for (; !(ascii_diff = *(const unsigned char*)a -
+                          *(const unsigned char*)b) && *a; a++, b++);
+    for (; is_digit(*a) && is_digit(*b); a++, b++);
+
+    if (a > start && is_digit(a[-1]) &&
+        (digit_diff = is_digit(*a) - is_digit(*b)))
+    {
+        return digit_diff;
+    }
+
+    return ascii_diff;
+}
+
+/* Perform a tree rotation in the specified direction and return the new root */
+static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
+    CheckasmFunc *const r = f->child[dir^1];
+    f->child[dir^1] = r->child[dir];
+    r->child[dir] = f;
+    r->color = f->color;
+    f->color = 0;
+    return r;
+}
+
+#define is_red(f) ((f) && !(f)->color)
+
+/* Balance a left-leaning red-black tree at the specified node */
+static void balance_tree(CheckasmFunc **const root) {
+    CheckasmFunc *const f = *root;
+
+    if (is_red(f->child[0]) && is_red(f->child[1])) {
+        f->color ^= 1;
+        f->child[0]->color = f->child[1]->color = 1;
+    }
+    else if (!is_red(f->child[0]) && is_red(f->child[1]))
+        *root = rotate_tree(f, 0); /* Rotate left */
+    else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
+        *root = rotate_tree(f, 1); /* Rotate right */
+}
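+
+/* This maintains the left-leaning red-black invariants: red links lean
+ * left, no red link is followed by another red link, and nodes with two red
+ * children are split by a color flip on the way back up the tree. */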
+
+/* Get a node with the specified name, creating it if it doesn't exist */
+static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
+    CheckasmFunc *f = *root;
+
+    if (f) {
+        /* Search the tree for a matching node */
+        const int cmp = cmp_func_names(name, f->name);
+        if (cmp) {
+            f = get_func(&f->child[cmp > 0], name);
+
+            /* Rebalance the tree on the way up if a new node was inserted */
+            if (!f->versions.func)
+                balance_tree(root);
+        }
+    } else {
+        /* Allocate and insert a new node into the tree */
+        const size_t name_length = strlen(name) + 1;
+        f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
+        memcpy(f->name, name, name_length);
+    }
+
+    return f;
+}
+
+checkasm_context checkasm_context_buf;
+
+/* Crash handling: attempt to catch crashes and handle them
+ * gracefully instead of just aborting abruptly. */
+#ifdef _WIN32
+static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
+    switch (e->ExceptionRecord->ExceptionCode) {
+    case EXCEPTION_FLT_DIVIDE_BY_ZERO:
+    case EXCEPTION_INT_DIVIDE_BY_ZERO:
+        checkasm_fail_func("fatal arithmetic error");
+        break;
+    case EXCEPTION_ILLEGAL_INSTRUCTION:
+    case EXCEPTION_PRIV_INSTRUCTION:
+        checkasm_fail_func("illegal instruction");
+        break;
+    case EXCEPTION_ACCESS_VIOLATION:
+    case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
+    case EXCEPTION_DATATYPE_MISALIGNMENT:
+    case EXCEPTION_IN_PAGE_ERROR:
+    case EXCEPTION_STACK_OVERFLOW:
+        checkasm_fail_func("segmentation fault");
+        break;
+    default:
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+    checkasm_load_context();
+    return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */
+}
+#else
+static void signal_handler(const int s) {
+    checkasm_set_signal_handler_state(0);
+    checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
+                       s == SIGILL ? "illegal instruction" :
+                                     "segmentation fault");
+    checkasm_load_context();
+}
+#endif
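+
+/* checkasm_load_context() returns control to the checkasm_save_context()
+ * call made by declare_func(), so a crashing function is reported as a
+ * regular test failure instead of taking down the whole test run. */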
+
+/* Perform tests and benchmarks for the specified
+ * cpu flag if supported by the host */
+static void check_cpu_flag(const char *const name, unsigned flag) {
+    const unsigned old_cpu_flag = state.cpu_flag;
+
+    flag |= old_cpu_flag;
+    dav1d_set_cpu_flags_mask(flag);
+    state.cpu_flag = dav1d_get_cpu_flags();
+
+    if (!flag || state.cpu_flag != old_cpu_flag) {
+        state.cpu_flag_name = name;
+        for (int i = 0; tests[i].func; i++) {
+            if (state.test_name && strcmp(tests[i].name, state.test_name))
+                continue;
+            xor128_srand(state.seed);
+            state.current_test_name = tests[i].name;
+            tests[i].func();
+        }
+    }
+}
+
+/* Print the name of the current CPU flag, but only do it once */
+static void print_cpu_name(void) {
+    if (state.cpu_flag_name) {
+        color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
+        state.cpu_flag_name = NULL;
+    }
+}
+
+int main(int argc, char *argv[]) {
+    (void)func_new, (void)func_ref; /* silence unused warnings for checkasm.h's static pointers */
+    state.seed = get_seed();
+    int ret = 0;
+
+    while (argc > 1) {
+        if (!strncmp(argv[1], "--help", 6)) {
+            fprintf(stdout,
+                    "checkasm [options] <random seed>\n"
+                    "    <random seed>       Numeric value to seed the rng\n"
+                    "Options:\n"
+                    "    --test=<test_name>  Test only <test_name>\n"
+                    "    --bench=<pattern>   Test and benchmark the functions matching <pattern>\n"
+                    "    --list-functions    List available functions\n"
+                    "    --list-tests        List available tests\n"
+                    "    --bench-c           Benchmark the C-only functions\n"
+                    "    --verbose -v        Print failures verbosely\n");
+            return 0;
+        } else if (!strncmp(argv[1], "--bench-c", 9)) {
+            state.bench_c = 1;
+        } else if (!strncmp(argv[1], "--bench", 7)) {
+#ifndef readtime
+            fprintf(stderr,
+                    "checkasm: --bench is not supported on your system\n");
+            return 1;
+#endif
+            if (argv[1][7] == '=') {
+                state.bench_pattern = argv[1] + 8;
+                state.bench_pattern_len = strlen(state.bench_pattern);
+            } else
+                state.bench_pattern = ""; /* an empty pattern matches everything */
+        } else if (!strncmp(argv[1], "--test=", 7)) {
+            state.test_name = argv[1] + 7;
+        } else if (!strcmp(argv[1], "--list-functions")) {
+            state.function_listing = 1;
+        } else if (!strcmp(argv[1], "--list-tests")) {
+            for (int i = 0; tests[i].name; i++)
+                printf("%s\n", tests[i].name);
+            return 0;
+        } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
+            state.verbose = 1;
+        } else {
+            state.seed = (unsigned) strtoul(argv[1], NULL, 10);
+        }
+
+        argc--;
+        argv++;
+    }
+
+    dav1d_init_cpu();
+
+    if (!state.function_listing) {
+        fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
+#if ARCH_X86_64
+        void checkasm_warmup_avx2(void);
+        void checkasm_warmup_avx512(void);
+        const unsigned cpu_flags = dav1d_get_cpu_flags();
+        if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
+            state.simd_warmup = checkasm_warmup_avx512;
+        else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
+            state.simd_warmup = checkasm_warmup_avx2;
+        checkasm_simd_warmup();
+#endif
+    }
+
+    check_cpu_flag(NULL, 0);
+
+    if (state.function_listing) {
+        print_functions(state.funcs);
+    } else {
+        for (int i = 0; cpus[i].flag; i++)
+            check_cpu_flag(cpus[i].name, cpus[i].flag);
+        if (!state.num_checked) {
+            fprintf(stderr, "checkasm: no tests to perform\n");
+        } else if (state.num_failed) {
+            fprintf(stderr, "checkasm: %d of %d tests have failed\n",
+                    state.num_failed, state.num_checked);
+            ret = 1;
+        } else {
+            fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
+#ifdef readtime
+            if (state.bench_pattern) {
+                state.nop_time = measure_nop_time();
+                printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
+                print_benchs(state.funcs);
+            }
+#endif
+        }
+    }
+
+    destroy_func_tree(state.funcs);
+    return ret;
+}
+
+/* Decide whether or not the specified function needs to be tested and
+ * allocate/initialize data structures if needed. Returns a pointer to a
+ * reference function if the function should be tested, otherwise NULL */
+void *checkasm_check_func(void *const func, const char *const name, ...) {
+    char name_buf[256];
+    va_list arg;
+
+    va_start(arg, name);
+    const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
+    va_end(arg);
+
+    if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf))
+        return NULL;
+
+    state.current_func = get_func(&state.funcs, name_buf);
+
+    if (state.function_listing) /* Save function names without running tests */
+        return NULL;
+
+    state.funcs->color = 1;
+    CheckasmFuncVersion *v = &state.current_func->versions;
+    void *ref = func;
+
+    if (v->func) {
+        CheckasmFuncVersion *prev;
+        do {
+            /* Only test functions that haven't already been tested */
+            if (v->func == func)
+                return NULL;
+
+            if (v->ok)
+                ref = v->func;
+
+            prev = v;
+        } while ((v = v->next));
+
+        v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
+    }
+
+    v->func = func;
+    v->ok = 1;
+    v->cpu = state.cpu_flag;
+    state.current_func_ver = v;
+    xor128_srand(state.seed);
+
+    if (state.cpu_flag || state.bench_c)
+        state.num_checked++;
+
+    return ref;
+}
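+
+/* Note that the returned reference is the most recent version that passed:
+ * a new SIMD version may therefore be verified against an earlier, already
+ * checked SIMD version rather than against the plain C implementation. */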
+
+/* Decide whether or not the current function needs to be benchmarked; the
+ * --bench pattern is matched against the start of the function name */
+int checkasm_bench_func(void) {
+    return !state.num_failed && state.bench_pattern &&
+           !strncmp(state.current_func->name, state.bench_pattern,
+                    state.bench_pattern_len);
+}
+
+/* Indicate that the current test has failed; returns whether verbose
+ * printing of failures is requested. */
+int checkasm_fail_func(const char *const msg, ...) {
+    if (state.current_func_ver->cpu && state.current_func_ver->ok) {
+        va_list arg;
+
+        print_cpu_name();
+        fprintf(stderr, "   %s_%s (", state.current_func->name,
+                cpu_suffix(state.current_func_ver->cpu));
+        va_start(arg, msg);
+        vfprintf(stderr, msg, arg);
+        va_end(arg);
+        fprintf(stderr, ")\n");
+
+        state.current_func_ver->ok = 0;
+        state.num_failed++;
+    }
+    return state.verbose;
+}
+
+/* Update benchmark results of the current function */
+void checkasm_update_bench(const int iterations, const uint64_t cycles) {
+    state.current_func_ver->iterations += iterations;
+    state.current_func_ver->cycles += cycles;
+}
+
+/* Print the outcome of all tests performed since
+ * the last time this function was called */
+void checkasm_report(const char *const name, ...) {
+    static int prev_checked, prev_failed;
+    static size_t max_length;
+
+    if (state.num_checked > prev_checked) {
+        int pad_length = (int) max_length + 4;
+        va_list arg;
+
+        print_cpu_name();
+        pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
+        va_start(arg, name);
+        pad_length -= vfprintf(stderr, name, arg);
+        va_end(arg);
+        fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
+
+        if (state.num_failed == prev_failed)
+            color_printf(COLOR_GREEN, "OK");
+        else
+            color_printf(COLOR_RED, "FAILED");
+        fprintf(stderr, "]\n");
+
+        prev_checked = state.num_checked;
+        prev_failed  = state.num_failed;
+    } else if (!state.cpu_flag) {
+        /* Calculate the amount of padding required
+         * to make the output vertically aligned */
+        size_t length = strlen(state.current_test_name);
+        va_list arg;
+
+        va_start(arg, name);
+        length += vsnprintf(NULL, 0, name, arg);
+        va_end(arg);
+
+        if (length > max_length)
+            max_length = length;
+    }
+}
+
+void checkasm_set_signal_handler_state(const int enabled) {
+#ifdef _WIN32
+    if (enabled)
+        AddVectoredExceptionHandler(0, signal_handler);
+    else
+        RemoveVectoredExceptionHandler(signal_handler);
+#else
+    void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
+    signal(SIGBUS,  handler);
+    signal(SIGFPE,  handler);
+    signal(SIGILL,  handler);
+    signal(SIGSEGV, handler);
+#endif
+}
+
+#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \
+int checkasm_check_##type(const char *const file, const int line, \
+                          const type *buf1, ptrdiff_t stride1, \
+                          const type *buf2, ptrdiff_t stride2, \
+                          const int w, int h, const char *const name) \
+{ \
+    stride1 /= sizeof(*buf1); \
+    stride2 /= sizeof(*buf2); \
+    int y; \
+    for (y = 0; y < h; y++) \
+        if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \
+            break; \
+    if (y == h) \
+        return 0; \
+    if (!checkasm_fail_func("%s:%d", file, line)) \
+        return 1; \
+    fprintf(stderr, "%s:\n", name); \
+    while (h--) { \
+        for (int x = 0; x < w; x++) \
+            fprintf(stderr, " " fmt, buf1[x]); \
+        fprintf(stderr, "    "); \
+        for (int x = 0; x < w; x++) \
+            fprintf(stderr, " " fmt, buf2[x]); \
+        fprintf(stderr, "    "); \
+        for (int x = 0; x < w; x++) \
+            fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \
+        buf1 += stride1; \
+        buf2 += stride2; \
+        fprintf(stderr, "\n"); \
+    } \
+    return 1; \
+}
+
+DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
+DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
+DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
+DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")
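+
+/* On mismatch (and with --verbose) the dump above prints the reference and
+ * tested buffers side by side, followed by a per-pixel map in which 'x'
+ * marks a differing value and '.' a matching one. */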
+
+#if ARCH_X86_64
+void checkasm_simd_warmup(void)
+{
+    if (state.simd_warmup)
+        state.simd_warmup();
+}
+#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
new file mode 100644 (file)
index 0000000..27c28d7
--- /dev/null
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H
+#define DAV1D_TESTS_CHECKASM_CHECKASM_H
+
+#include "config.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if ARCH_X86_64 && defined(_WIN32)
+/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack,
+ * which doesn't work for assembly functions without unwind information. */
+#include <windows.h>
+#define checkasm_context CONTEXT
+#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf)
+#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL)
+#else
+#include <setjmp.h>
+#define checkasm_context jmp_buf
+#define checkasm_save_context() setjmp(checkasm_context_buf)
+#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
+#endif
+
+#include "include/common/attributes.h"
+#include "include/common/bitdepth.h"
+#include "include/common/intops.h"
+
+int xor128_rand(void);
+#define rnd xor128_rand
+
+#define decl_check_bitfns(name) \
+name##_8bpc(void); \
+name##_16bpc(void)
+
+void checkasm_check_msac(void);
+decl_check_bitfns(void checkasm_check_cdef);
+decl_check_bitfns(void checkasm_check_filmgrain);
+decl_check_bitfns(void checkasm_check_ipred);
+decl_check_bitfns(void checkasm_check_itx);
+decl_check_bitfns(void checkasm_check_loopfilter);
+decl_check_bitfns(void checkasm_check_looprestoration);
+decl_check_bitfns(void checkasm_check_mc);
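+
+/* e.g. decl_check_bitfns(void checkasm_check_cdef) declares both
+ * checkasm_check_cdef_8bpc(void) and checkasm_check_cdef_16bpc(void) */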
+
+void *checkasm_check_func(void *func, const char *name, ...);
+int checkasm_bench_func(void);
+int checkasm_fail_func(const char *msg, ...);
+void checkasm_update_bench(int iterations, uint64_t cycles);
+void checkasm_report(const char *name, ...);
+void checkasm_set_signal_handler_state(int enabled);
+extern checkasm_context checkasm_context_buf;
+
+/* float compare utilities */
+int float_near_ulp(float a, float b, unsigned max_ulp);
+int float_near_abs_eps(float a, float b, float eps);
+int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp);
+int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp,
+                         int len);
+int float_near_abs_eps_array(const float *a, const float *b, float eps,
+                             int len);
+int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
+                                 unsigned max_ulp, int len);
+
+static void *func_ref, *func_new;
+
+#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
+
+/* Decide whether or not the specified function needs to be tested */
+#define check_func(func, ...)\
+    (func_ref = checkasm_check_func((func_new = func), __VA_ARGS__))
+
+/* Declare the function prototype. The first argument is the return value,
+ * the remaining arguments are the function parameters. Naming parameters
+ * is optional. */
+#define declare_func(ret, ...)\
+    declare_new(ret, __VA_ARGS__)\
+    typedef ret func_type(__VA_ARGS__);\
+    checkasm_save_context()
+
+/* Indicate that the current test has failed */
+#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__)
+
+/* Print the test outcome */
+#define report checkasm_report
+
+/* Call the reference function */
+#define call_ref(...)\
+    (checkasm_set_signal_handler_state(1),\
+     ((func_type *)func_ref)(__VA_ARGS__));\
+    checkasm_set_signal_handler_state(0)
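+
+/* A typical test then follows this pattern (sketch; dsp, some_fn, c_dst and
+ * a_dst are illustrative names, not part of this API):
+ *
+ *     declare_func(void, pixel *dst, ptrdiff_t stride);
+ *     if (check_func(dsp->some_fn, "some_fn_%dbpc", BITDEPTH)) {
+ *         call_ref(c_dst, stride);
+ *         call_new(a_dst, stride);
+ *         checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+ *         bench_new(a_dst, stride);
+ *     }
+ *     report("some_fn");
+ */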
+
+#if HAVE_ASM
+#if ARCH_X86
+#ifdef _MSC_VER
+#include <intrin.h>
+#define readtime() (_mm_lfence(), __rdtsc())
+#else
+static inline uint64_t readtime(void) {
+    uint32_t eax, edx;
+    __asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx));
+    return (((uint64_t)edx) << 32) | eax;
+}
+#define readtime readtime
+#endif
+#elif ARCH_AARCH64
+#ifdef _MSC_VER
+#include <windows.h>
+#define readtime() (_InstructionSynchronizationBarrier(), ReadTimeStampCounter())
+#else
+static inline uint64_t readtime(void) {
+    uint64_t cycle_counter;
+    /* This requires enabling user mode access to the cycle counter (which
+     * can only be done from kernel space).
+     * This could also read cntvct_el0 instead of pmccntr_el0; that register
+     * might also be readable (depending on kernel version), but it has much
+     * worse precision (it's a fixed 50 MHz timer). */
+    __asm__ __volatile__("isb\nmrs %0, pmccntr_el0"
+                         : "=r"(cycle_counter)
+                         :: "memory");
+    return cycle_counter;
+}
+#define readtime readtime
+#endif
+#elif ARCH_ARM && !defined(_MSC_VER) && __ARM_ARCH >= 7
+static inline uint64_t readtime(void) {
+    uint32_t cycle_counter;
+    /* This requires enabling user mode access to the cycle counter (which
+     * can only be done from kernel space). */
+    __asm__ __volatile__("isb\nmrc p15, 0, %0, c9, c13, 0"
+                         : "=r"(cycle_counter)
+                         :: "memory");
+    return cycle_counter;
+}
+#define readtime readtime
+#elif ARCH_PPC64LE
+static inline uint64_t readtime(void) {
+    uint32_t tbu, tbl, temp;
+
+    __asm__ __volatile__(
+        "1:\n"
+        "mfspr %2,269\n"
+        "mfspr %0,268\n"
+        "mfspr %1,269\n"
+        "cmpw   %2,%1\n"
+        "bne    1b\n"
+    : "=r"(tbl), "=r"(tbu), "=r"(temp)
+    :
+    : "cc");
+
+    return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
+}
+#define readtime readtime
+#endif
+
+/* Verifies that clobbered callee-saved registers
+ * are properly saved and restored */
+void checkasm_checked_call(void *func, ...);
+
+#if ARCH_X86_64
+/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
+ * to 64-bit. This is done by clobbering the stack with junk around the stack
+ * pointer and calling the assembly function through checked_call() with added
+ * dummy arguments which forces all real arguments to be passed on the stack
+ * and not in registers. For 32-bit arguments the upper half of the 64-bit
+ * register locations on the stack will now contain junk which will cause
+ * misbehaving functions to either produce incorrect output or segfault. Note
+ * that even though this works extremely well in practice, it's technically
+ * not guaranteed and false negatives are theoretically possible, but there
+ * can never be any false positives. */
+void checkasm_stack_clobber(uint64_t clobber, ...);
+/* YMM and ZMM registers on x86 are turned off to save power when they haven't
+ * been used for some period of time. When they are used there will be a
+ * "warmup" period during which performance will be reduced and inconsistent
+ * which is problematic when trying to benchmark individual functions. We can
+ * work around this by periodically issuing "dummy" instructions that use
+ * those registers to keep them powered on. */
+void checkasm_simd_warmup(void);
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
+                        int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
+    (void *)checkasm_checked_call;
+#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
+#ifdef _WIN32
+#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
+#else
+#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
+#endif
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     checkasm_simd_warmup(),\
+     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
+     checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
+    checkasm_set_signal_handler_state(0)
+#elif ARCH_X86_32
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int, int, int) =\
+        (void *)checkasm_checked_call;
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\
+                  11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\
+    checkasm_set_signal_handler_state(0)
+#elif ARCH_ARM
+/* Use a dummy argument to offset the real parameters by 2 instead of 1, so
+ * that any 8-byte alignment of the parameters is preserved even after the
+ * extra parameters have been removed. */
+void checkasm_checked_call_vfp(void *func, int dummy, ...);
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
+                        int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
+    (void *)checkasm_checked_call_vfp;
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
+    checkasm_set_signal_handler_state(0)
+#elif ARCH_AARCH64 && !defined(__APPLE__)
+void checkasm_stack_clobber(uint64_t clobber, ...);
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, int, int, int, int, int, int, int,\
+                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
+    (void *)checkasm_checked_call;
+#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB),\
+     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
+                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
+    checkasm_set_signal_handler_state(0)
+#else
+#define declare_new(ret, ...)
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     ((func_type *)func_new)(__VA_ARGS__));\
+    checkasm_set_signal_handler_state(0)
+#endif
+#else /* HAVE_ASM */
+#define declare_new(ret, ...)
+/* Call the function */
+#define call_new(...)\
+    (checkasm_set_signal_handler_state(1),\
+     ((func_type *)func_new)(__VA_ARGS__));\
+    checkasm_set_signal_handler_state(0)
+#endif /* HAVE_ASM */
+
+/* Benchmark the function */
+#ifdef readtime
+#define bench_new(...)\
+    do {\
+        if (checkasm_bench_func()) {\
+            checkasm_set_signal_handler_state(1);\
+            func_type *tfunc = func_new;\
+            uint64_t tsum = 0;\
+            int tcount = 0;\
+            for (int ti = 0; ti < BENCH_RUNS; ti++) {\
+                uint64_t t = readtime();\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                t = readtime() - t;\
+                if (t*tcount <= tsum*4 && ti > 0) {\
+                    tsum += t;\
+                    tcount++;\
+                }\
+            }\
+            checkasm_set_signal_handler_state(0);\
+            checkasm_update_bench(tcount, tsum);\
+        }\
+    } while (0)
+#else
+#define bench_new(...) do {} while (0)
+#endif
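+
+/* Each bench sample above times four back-to-back calls; the first sample
+ * is discarded as warmup, and samples more than 4x the running mean (e.g.
+ * ones hit by an interrupt or context switch) are rejected as outliers. */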
+
+#define DECL_CHECKASM_CHECK_FUNC(type) \
+int checkasm_check_##type(const char *const file, const int line, \
+                          const type *const buf1, const ptrdiff_t stride1, \
+                          const type *const buf2, const ptrdiff_t stride2, \
+                          const int w, const int h, const char *const name)
+
+DECL_CHECKASM_CHECK_FUNC(uint8_t);
+DECL_CHECKASM_CHECK_FUNC(uint16_t);
+DECL_CHECKASM_CHECK_FUNC(int16_t);
+DECL_CHECKASM_CHECK_FUNC(int32_t);
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#define checkasm_check(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__)
+
+#ifdef BITDEPTH
+#define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__)
+#define checkasm_check_coef(...)  checkasm_check(COEF_TYPE,  __VA_ARGS__)
+#endif
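+
+/* e.g. in an 8bpc build, where PIXEL_TYPE is uint8_t, checkasm_check_pixel(...)
+ * expands to checkasm_check_uint8_t(__FILE__, __LINE__, ...) */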
+
+#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */
diff --git a/tests/checkasm/filmgrain.c b/tests/checkasm/filmgrain.c
new file mode 100644 (file)
index 0000000..1219ee7
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/film_grain.h"
+#define UNIT_TEST 1
+#include "src/fg_apply_tmpl.c"
+
+static const char ss_name[][4] = {
+    [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
+    [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
+    [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
+};
+
+static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
+    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
+    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+
+    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
+                 const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);
+
+    for (int i = 0; i < 4; i++) {
+        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
+            ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
+            fg_data[0].seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#endif
+
+            fg_data[0].grain_scale_shift = rnd() & 3;
+            fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
+            fg_data[0].ar_coeff_lag = i;
+            /* 2*lag*(lag+1) coefficients in the causal AR filter template */
+            const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
+            for (int n = 0; n < num_y_pos; n++)
+                fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+
+            call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
+            call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
+            if (memcmp(grain_lut_c, grain_lut_a,
+                       GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
+            {
+                fail();
+            }
+
+            bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
+        }
+    }
+
+    report("gen_grain_y");
+}
+
+static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
+    entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
+    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+
+    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
+                 const entry grain_lut_y[][GRAIN_WIDTH],
+                 const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);
+
+    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
+        const enum Dav1dPixelLayout layout = layout_idx + 1;
+        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
+
+        for (int i = 0; i < 4; i++) {
+            if (check_func(dsp->generate_grain_uv[layout_idx],
+                           "gen_grain_uv_ar%d_%dbpc_%s",
+                           i, BITDEPTH, ss_name[layout_idx]))
+            {
+                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
+                fg_data[0].seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#endif
+
+                fg_data[0].num_y_points = rnd() & 1;
+                fg_data[0].grain_scale_shift = rnd() & 3;
+                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
+                fg_data[0].ar_coeff_lag = i;
+                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
+                for (int n = 0; n < num_y_pos; n++)
+                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+                dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);
+
+                const int uv = rnd() & 1;
+                const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
+                for (int n = 0; n < num_uv_pos; n++)
+                    fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
+                if (!fg_data[0].num_y_points)
+                    fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
+                memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
+                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
+                call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
+                call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
+                int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
+                for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
+                    diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
+                if (diff) fail();
+
+                bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    }
+
+    report("gen_grain_uv");
+}
+
+static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, src, 128 * 32,);
+    const ptrdiff_t stride = 128 * sizeof(pixel);
+
+    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
+                 const Dav1dFilmGrainData *data, size_t pw,
+                 const uint8_t scaling[SCALING_SIZE],
+                 const entry grain_lut[][GRAIN_WIDTH],
+                 int bh, int row_num HIGHBD_DECL_SUFFIX);
+
+    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
+        ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
+        fg_data[0].seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+        const int bitdepth_max = 0xff;
+#endif
+
+        uint8_t scaling[SCALING_SIZE];
+        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+        fg_data[0].grain_scale_shift = rnd() & 3;
+        fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
+        fg_data[0].ar_coeff_lag = rnd() & 3;
+        const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
+        for (int n = 0; n < num_y_pos; n++)
+            fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+        dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);
+
+        fg_data[0].num_y_points = 2 + (rnd() % 13);
+        const int pad = 0xff / fg_data[0].num_y_points;
+        /* generate strictly increasing x coordinates for the piecewise-linear
+         * scaling function, with random y values */
+        for (int n = 0; n < fg_data[0].num_y_points; n++) {
+            fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
+            fg_data[0].y_points[n][0] += rnd() % pad;
+            fg_data[0].y_points[n][1] = rnd() & 0xff;
+        }
+        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
+                         fg_data[0].num_y_points, scaling);
+
+        const int w = 1 + (rnd() & 127);
+        const int h = 1 + (rnd() & 31);
+
+        for (int y = 0; y < 32; y++)
+            for (int x = 0; x < 128; x++)
+                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
+        const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
+
+        fg_data[0].clip_to_restricted_range = rnd() & 1;
+        fg_data[0].scaling_shift = (rnd() & 3) + 8;
+        for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
+             fg_data[0].overlap_flag++)
+        {
+            call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
+                     row_num HIGHBD_TAIL_SUFFIX);
+            call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
+                     row_num HIGHBD_TAIL_SUFFIX);
+
+            checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+        }
+        fg_data[0].overlap_flag = 1;
+        bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
+                  row_num HIGHBD_TAIL_SUFFIX);
+    }
+
+    report("fgy_32x32xn");
+}
+
+static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, src, 128 * 32,);
+    ALIGN_STK_64(pixel, luma_src, 128 * 32,);
+    const ptrdiff_t lstride = 128 * sizeof(pixel);
+
+    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
+                 const Dav1dFilmGrainData *data, size_t pw,
+                 const uint8_t scaling[SCALING_SIZE],
+                 const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
+                 const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
+                 int is_identity HIGHBD_DECL_SUFFIX);
+
+    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
+        const enum Dav1dPixelLayout layout = layout_idx + 1;
+        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
+        const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel);
+
+        for (int csfl = 0; csfl <= 1; csfl++) {
+            if (check_func(dsp->fguv_32x32xn[layout_idx],
+                           "fguv_32x32xn_%dbpc_%s_csfl%d",
+                           BITDEPTH, ss_name[layout_idx], csfl))
+            {
+                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
+
+                fg_data[0].seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                const int uv_pl = rnd() & 1;
+                const int is_identity = rnd() & 1;
+
+                uint8_t scaling[SCALING_SIZE];
+                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+                fg_data[0].grain_scale_shift = rnd() & 3;
+                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
+                fg_data[0].ar_coeff_lag = rnd() & 3;
+                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
+                for (int n = 0; n < num_y_pos; n++)
+                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+                const int num_uv_pos = num_y_pos + 1;
+                for (int n = 0; n < num_uv_pos; n++)
+                    fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
+                dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
+                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
+                                                   fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
+
+                const int w = 1 + (rnd() & (127 >> ss_x));
+                const int h = 1 + (rnd() & (31 >> ss_y));
+
+                for (int y = 0; y < 32; y++)
+                    for (int x = 0; x < 128; x++)
+                        src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
+                for (int y = 0; y < 32; y++)
+                    for (int x = 0; x < 128; x++)
+                        luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
+                const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
+
+                if (csfl) {
+                    fg_data[0].num_y_points = 2 + (rnd() % 13);
+                    const int pad = 0xff / fg_data[0].num_y_points;
+                    for (int n = 0; n < fg_data[0].num_y_points; n++) {
+                        fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
+                        fg_data[0].y_points[n][0] += rnd() % pad;
+                        fg_data[0].y_points[n][1] = rnd() & 0xff;
+                    }
+                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
+                                     fg_data[0].num_y_points, scaling);
+                } else {
+                    fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
+                    const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
+                    for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
+                        fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
+                        fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
+                        fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
+                    }
+                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
+                                     fg_data[0].num_uv_points[uv_pl], scaling);
+
+                    fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
+                    fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
+                    fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
+                }
+
+                fg_data[0].clip_to_restricted_range = rnd() & 1;
+                fg_data[0].scaling_shift = (rnd() & 3) + 8;
+                fg_data[0].chroma_scaling_from_luma = csfl;
+                for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
+                     fg_data[0].overlap_flag++)
+                {
+                    call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
+                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+                    call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
+                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+
+                    checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+                }
+
+                fg_data[0].overlap_flag = 1;
+                bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
+                          row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    }
+
+    report("fguv_32x32xn");
+}
+
+void bitfn(checkasm_check_filmgrain)(void) {
+    Dav1dFilmGrainDSPContext c;
+
+    bitfn(dav1d_film_grain_dsp_init)(&c);
+
+    check_gen_grny(&c);
+    check_gen_grnuv(&c);
+    check_fgy_sbrow(&c);
+    check_fguv_sbrow(&c);
+}
diff --git a/tests/checkasm/ipred.c b/tests/checkasm/ipred.c
new file mode 100644 (file)
index 0000000..6b054a7
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+#include "src/ipred.h"
+#include "src/levels.h"
+
+#include <stdio.h>
+
+static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_PRED]       = "dc",
+    [DC_128_PRED]   = "dc_128",
+    [TOP_DC_PRED]   = "dc_top",
+    [LEFT_DC_PRED]  = "dc_left",
+    [HOR_PRED]      = "h",
+    [VERT_PRED]     = "v",
+    [PAETH_PRED]    = "paeth",
+    [SMOOTH_PRED]   = "smooth",
+    [SMOOTH_V_PRED] = "smooth_v",
+    [SMOOTH_H_PRED] = "smooth_h",
+    [Z1_PRED]       = "z1",
+    [Z2_PRED]       = "z2",
+    [Z3_PRED]       = "z3",
+    [FILTER_PRED]   = "filter"
+};
+
+static const char *const cfl_ac_names[3] = { "420", "422", "444" };
+
+static const char *const cfl_pred_mode_names[DC_128_PRED + 1] = {
+    [DC_PRED]       = "cfl",
+    [DC_128_PRED]   = "cfl_128",
+    [TOP_DC_PRED]   = "cfl_top",
+    [LEFT_DC_PRED]  = "cfl_left",
+};
+
+static const uint8_t z_angles[27] = {
+     3,  6,  9,
+    14, 17, 20, 23, 26, 29, 32,
+    36, 39, 42, 45, 48, 51, 54,
+    58, 61, 64, 67, 70, 73, 76,
+    81, 84, 87
+};
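+
+/* Angle offsets within one 90 degree quadrant used to exercise the
+ * directional (Z1-Z3) predictors; the base angle is 90 * (mode - Z1_PRED). */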
+
+static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, topleft_buf, 257,);
+    pixel *const topleft = topleft_buf + 128;
+
+    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
+                 int width, int height, int angle, int max_width, int max_height
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) {
+        int bpc_min = BITDEPTH, bpc_max = BITDEPTH;
+        if (mode == FILTER_PRED && BITDEPTH == 16) {
+            bpc_min = 10;
+            bpc_max = 12;
+        }
+        for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2)
+            for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1)
+                if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc",
+                    intra_pred_mode_names[mode], w, bpc))
+                {
+                    for (int h = imax(w / 4, 4); h <= imin(w * 4,
+                        (mode == FILTER_PRED ? 32 : 64)); h <<= 1)
+                    {
+                        const ptrdiff_t stride = w * sizeof(pixel);
+
+                        int a = 0, maxw = 0, maxh = 0;
+                        if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
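+                            /* The low 9 bits hold the angle itself; bits
+                             * 9-10 carry extra flags, randomized via 0x600. */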
+                            a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
+                                (rnd() & 0x600);
+                            if (mode == Z2_PRED) {
+                                maxw = rnd(), maxh = rnd();
+                                maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
+                                maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                            }
+                        } else if (mode == FILTER_PRED) /* filter_idx */
+                            a = (rnd() % 5) | (rnd() & ~511);
+
+                        int bitdepth_max;
+                        if (bpc == 16)
+                            bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+                        else
+                            bitdepth_max = (1 << bpc) - 1;
+
+                        for (int i = -h * 2; i <= w * 2; i++)
+                            topleft[i] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
+                                 HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
+                                 HIGHBD_TAIL_SUFFIX);
+                        if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
+                                                 w, h, "dst"))
+                        {
+                            if (mode == Z1_PRED || mode == Z3_PRED)
+                                fprintf(stderr, "angle = %d (0x%03x)\n",
+                                        a & 0x1ff, a & 0x600);
+                            else if (mode == Z2_PRED)
+                                fprintf(stderr, "angle = %d (0x%03x), "
+                                        "max_width = %d, max_height = %d\n",
+                                        a & 0x1ff, a & 0x600, maxw, maxh);
+                            else if (mode == FILTER_PRED)
+                                fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
+                        }
+
+                        bench_new(a_dst, stride, topleft, w, h, a, 128, 128
+                                  HIGHBD_TAIL_SUFFIX);
+                    }
+                }
+    }
+    report("intra_pred");
+}
+
+static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
+    ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
+    ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, luma, 32 * 32,);
+
+    declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
+                 int w_pad, int h_pad, int cw, int ch);
+
+    for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) {
+        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver;
+        for (int w = 4; w <= (32 >> ss_hor); w <<= 1)
+            if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc",
+                cfl_ac_names[layout - 1], w, BITDEPTH))
+            {
+                for (int h = imax(w / 4, 4);
+                     h <= imin(w * 4, (32 >> ss_ver)); h <<= 1)
+                {
+                    const ptrdiff_t stride = 32 * sizeof(pixel);
+                    for (int w_pad = imax((w >> 2) - h_step, 0);
+                         w_pad >= 0; w_pad -= h_step)
+                    {
+                        for (int h_pad = imax((h >> 2) - v_step, 0);
+                             h_pad >= 0; h_pad -= v_step)
+                        {
+#if BITDEPTH == 16
+                            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                            const int bitdepth_max = 0xff;
+#endif
+                            for (int y = 0; y < (h << ss_ver); y++)
+                                for (int x = 0; x < (w << ss_hor); x++)
+                                    luma[y * 32 + x] = rnd() & bitdepth_max;
+
+                            call_ref(c_dst, luma, stride, w_pad, h_pad, w, h);
+                            call_new(a_dst, luma, stride, w_pad, h_pad, w, h);
+                            checkasm_check(int16_t, c_dst, w * sizeof(*c_dst),
+                                                    a_dst, w * sizeof(*a_dst),
+                                                    w, h, "dst");
+                        }
+                    }
+
+                    bench_new(a_dst, luma, stride, 0, 0, w, h);
+                }
+            }
+    }
+    report("cfl_ac");
+}
+
+static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(int16_t, ac, 32 * 32,);
+    ALIGN_STK_64(pixel, topleft_buf, 257,);
+    pixel *const topleft = topleft_buf + 128;
+
+    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
+                 int width, int height, const int16_t *ac, int alpha
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int mode = 0; mode <= DC_128_PRED; mode += 1 + 2 * !mode)
+        for (int w = 4; w <= 32; w <<= 1)
+            if (check_func(c->cfl_pred[mode], "cfl_pred_%s_w%d_%dbpc",
+                cfl_pred_mode_names[mode], w, BITDEPTH))
+            {
+                for (int h = imax(w / 4, 4); h <= imin(w * 4, 32); h <<= 1)
+                {
+#if BITDEPTH == 16
+                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                    const int bitdepth_max = 0xff;
+#endif
+
+                    const ptrdiff_t stride = w * sizeof(pixel);
+
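+                    /* Random nonzero CfL scaling factor in the range ±[1, 16]. */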
+                    int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));
+
+                    for (int i = -h * 2; i <= w * 2; i++)
+                        topleft[i] = rnd() & bitdepth_max;
+
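+                    /* The CfL AC input is zero-mean: generate random samples,
+                     * then subtract the (rounded) average. */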
+                    int luma_avg = w * h >> 1;
+                    for (int i = 0; i < w * h; i++)
+                        luma_avg += ac[i] = rnd() & (bitdepth_max << 3);
+                    luma_avg /= w * h;
+                    for (int i = 0; i < w * h; i++)
+                        ac[i] -= luma_avg;
+
+                    call_ref(c_dst, stride, topleft, w, h, ac, alpha
+                             HIGHBD_TAIL_SUFFIX);
+                    call_new(a_dst, stride, topleft, w, h, ac, alpha
+                             HIGHBD_TAIL_SUFFIX);
+                    checkasm_check_pixel(c_dst, stride, a_dst, stride,
+                                         w, h, "dst");
+
+                    bench_new(a_dst, stride, topleft, w, h, ac, alpha
+                              HIGHBD_TAIL_SUFFIX);
+                }
+            }
+    report("cfl_pred");
+}
+
+static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
+    ALIGN_STK_16(uint16_t, pal, 8,);
+
+    declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
+                 const uint8_t *idx, int w, int h);
+
+    for (int w = 4; w <= 64; w <<= 1)
+        if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH))
+            for (int h = imax(w / 4, 4); h <= imin(w * 4, 64); h <<= 1)
+            {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                const ptrdiff_t stride = w * sizeof(pixel);
+
+                for (int i = 0; i < 8; i++)
+                    pal[i] = rnd() & bitdepth_max;
+
+                for (int i = 0; i < w * h; i++)
+                    idx[i] = rnd() & 7;
+
+                call_ref(c_dst, stride, pal, idx, w, h);
+                call_new(a_dst, stride, pal, idx, w, h);
+                checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+
+                bench_new(a_dst, stride, pal, idx, w, h);
+            }
+    report("pal_pred");
+}
+
+void bitfn(checkasm_check_ipred)(void) {
+    Dav1dIntraPredDSPContext c;
+    bitfn(dav1d_intra_pred_dsp_init)(&c);
+
+    check_intra_pred(&c);
+    check_cfl_ac(&c);
+    check_cfl_pred(&c);
+    check_pal_pred(&c);
+}
diff --git a/tests/checkasm/itx.c b/tests/checkasm/itx.c
new file mode 100644
index 0000000..01f5e05
--- /dev/null
+++ b/tests/checkasm/itx.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <math.h>
+
+#include "src/itx.h"
+#include "src/levels.h"
+#include "src/scan.h"
+#include "src/tables.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_SQRT1_2
+#define M_SQRT1_2 0.707106781186547524401
+#endif
+
+enum Tx1D { DCT, ADST, FLIPADST, IDENTITY, WHT };
+
+static const uint8_t itx_1d_types[N_TX_TYPES_PLUS_LL][2] = {
+    [DCT_DCT]           = { DCT,      DCT      },
+    [ADST_DCT]          = { DCT,      ADST     },
+    [DCT_ADST]          = { ADST,     DCT      },
+    [ADST_ADST]         = { ADST,     ADST     },
+    [FLIPADST_DCT]      = { DCT,      FLIPADST },
+    [DCT_FLIPADST]      = { FLIPADST, DCT      },
+    [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
+    [ADST_FLIPADST]     = { FLIPADST, ADST     },
+    [FLIPADST_ADST]     = { ADST,     FLIPADST },
+    [IDTX]              = { IDENTITY, IDENTITY },
+    [V_DCT]             = { IDENTITY, DCT      },
+    [H_DCT]             = { DCT,      IDENTITY },
+    [V_ADST]            = { IDENTITY, ADST     },
+    [H_ADST]            = { ADST,     IDENTITY },
+    [V_FLIPADST]        = { IDENTITY, FLIPADST },
+    [H_FLIPADST]        = { FLIPADST, IDENTITY },
+    [WHT_WHT]           = { WHT,      WHT      },
+};
+
+static const char *const itx_1d_names[5] = {
+    [DCT]      = "dct",
+    [ADST]     = "adst",
+    [FLIPADST] = "flipadst",
+    [IDENTITY] = "identity",
+    [WHT]      = "wht"
+};
+
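+/* Normalization applied after the first forward transform pass, indexed by
+ * log2(w * h) - 4. */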
+static const double scaling_factors[9] = {
+    4.0000,             /*  4x4                          */
+    4.0000 * M_SQRT1_2, /*  4x8   8x4                    */
+    2.0000,             /*  4x16  8x8  16x4              */
+    2.0000 * M_SQRT1_2, /*        8x16 16x8              */
+    1.0000,             /*        8x32 16x16 32x8        */
+    0.5000 * M_SQRT1_2, /*             16x32 32x16       */
+    0.2500,             /*             16x64 32x32 64x16 */
+    0.1250 * M_SQRT1_2, /*                   32x64 64x32 */
+    0.0625,             /*                         64x64 */
+};
+
+/* FIXME: Ensure that these forward transforms match the real AV1
+ * transforms. For example, FLIPADST currently uses the ADST forward
+ * transform, which is obviously "incorrect", but it is kept for now
+ * since it at least produces coefficients in the correct range. */
+
+/* DCT-II */
+static void fdct_1d(double *const out, const double *const in, const int sz) {
+    for (int i = 0; i < sz; i++) {
+        out[i] = 0.0;
+        for (int j = 0; j < sz; j++)
+            out[i] += in[j] * cos(M_PI * (2 * j + 1) * i / (sz * 2.0));
+    }
+    out[0] *= M_SQRT1_2;
+}
+
+/* See "Towards jointly optimal spatial prediction and adaptive transform in
+ * video/image coding", by J. Han, A. Saxena, and K. Rose
+ * IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
+ * and "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
+ * by Jingning Han, Yaowu Xu, and Debargha Mukherjee
+ * http://research.google.com/pubs/archive/41418.pdf
+ */
+static void fadst_1d(double *const out, const double *const in, const int sz) {
+    for (int i = 0; i < sz; i++) {
+        out[i] = 0.0;
+        for (int j = 0; j < sz; j++)
+            out[i] += in[j] * sin(M_PI *
+            (sz == 4 ? (    j + 1) * (2 * i + 1) / (8.0 + 1.0) :
+                       (2 * j + 1) * (2 * i + 1) / (sz * 4.0)));
+    }
+}
+
+static void fwht4_1d(double *const out, const double *const in)
+{
+    const double t0 = in[0] + in[1];
+    const double t3 = in[3] - in[2];
+    const double t4 = (t0 - t3) * 0.5;
+    const double t1 = t4 - in[1];
+    const double t2 = t4 - in[2];
+    out[0] = t0 - t2;
+    out[1] = t2;
+    out[2] = t3 + t1;
+    out[3] = t1;
+}
+
+static int copy_subcoefs(coef *coeff,
+                         const enum RectTxfmSize tx, const enum TxfmType txtp,
+                         const int sw, const int sh, const int subsh)
+{
+    /* Copy the topleft coefficients such that the return value (being the
+     * coefficient scantable index for the eob token) guarantees that only
+     * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
+     * dimensions are non-zero. This leads to branching to specific optimized
+     * SIMD versions (e.g. dc-only) so that we get full asm coverage in this
+     * test. */
+    const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
+    const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
+    const int sub_low  = subsh > 1 ? sub_high - 8 : 0;
+    int n, eob;
+
+    for (n = 0, eob = 0; n < sw * sh; n++) {
+        const int rc = scan[n];
+        const int rcx = rc % sh, rcy = rc / sh;
+
+        /* Pick a random eob within this sub-itx */
+        if (rcx > sub_high || rcy > sub_high) {
+            break; /* upper boundary */
+        } else if (!eob && (rcx > sub_low || rcy > sub_low))
+            eob = n; /* lower boundary */
+    }
+
+    if (eob)
+        eob += rnd() % (n - eob - 1);
+    for (n = eob + 1; n < sw * sh; n++)
+        coeff[scan[n]] = 0;
+    for (; n < 32 * 32; n++)
+        coeff[n] = rnd();
+    return eob;
+}
+
+static int ftx(coef *const buf, const enum RectTxfmSize tx,
+               const enum TxfmType txtp, const int w, const int h,
+               const int subsh, const int bitdepth_max)
+{
+    double out[64 * 64], temp[64 * 64];
+    const double scale = scaling_factors[ctz(w * h) - 4];
+    const int sw = imin(w, 32), sh = imin(h, 32);
+
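+    /* First pass: 1D forward transform along each row. */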
+    for (int i = 0; i < h; i++) {
+        double in[64], temp_out[64];
+
+        for (int j = 0; j < w; j++)
+            in[j] = (rnd() & (2 * bitdepth_max + 1)) - bitdepth_max;
+
+        switch (itx_1d_types[txtp][0]) {
+        case DCT:
+            fdct_1d(temp_out, in, w);
+            break;
+        case ADST:
+        case FLIPADST:
+            fadst_1d(temp_out, in, w);
+            break;
+        case WHT:
+            fwht4_1d(temp_out, in);
+            break;
+        case IDENTITY:
+            memcpy(temp_out, in, w * sizeof(*temp_out));
+            break;
+        }
+
+        for (int j = 0; j < w; j++)
+            temp[j * h + i] = temp_out[j] * scale;
+    }
+
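+    /* Second pass: 1D forward transform along each column. */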
+    for (int i = 0; i < w; i++) {
+        switch (itx_1d_types[txtp][1]) {
+        case DCT:
+            fdct_1d(&out[i * h], &temp[i * h], h);
+            break;
+        case ADST:
+        case FLIPADST:
+            fadst_1d(&out[i * h], &temp[i * h], h);
+            break;
+        case WHT:
+            fwht4_1d(&out[i * h], &temp[i * h]);
+            break;
+        case IDENTITY:
+            memcpy(&out[i * h], &temp[i * h], h * sizeof(*out));
+            break;
+        }
+    }
+
+    for (int y = 0; y < sh; y++)
+        for (int x = 0; x < sw; x++)
+            buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);
+
+    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
+}
+
+void bitfn(checkasm_check_itx)(void) {
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
+
+    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    Dav1dInvTxfmDSPContext c = { { { 0 } } }; /* Zero unused function pointer elements. */
+
+    static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
+        TX_4X4,   RTX_4X8,  RTX_4X16,
+        RTX_8X4,  TX_8X8,   RTX_8X16,  RTX_8X32,
+        RTX_16X4, RTX_16X8, TX_16X16,  RTX_16X32, RTX_16X64,
+                  RTX_32X8, RTX_32X16, TX_32X32,  RTX_32X64,
+                            RTX_64X16, RTX_64X32, TX_64X64
+    };
+
+    static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int i = 0; i < N_RECT_TX_SIZES; i++) {
+        const enum RectTxfmSize tx = txfm_size_order[i];
+        const int w = dav1d_txfm_dimensions[tx].w * 4;
+        const int h = dav1d_txfm_dimensions[tx].h * 4;
+        const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
+                                               dav1d_txfm_dimensions[tx].lh)];
+
+        for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+            bitfn(dav1d_itx_dsp_init)(&c, bpc);
+            for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
+                for (int subsh = 0; subsh < subsh_max; subsh++)
+                    if (check_func(c.itxfm_add[tx][txtp],
+                                   "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
+                                   w, h, itx_1d_names[itx_1d_types[txtp][0]],
+                                   itx_1d_names[itx_1d_types[txtp][1]], subsh,
+                                   bpc))
+                    {
+                        const int bitdepth_max = (1 << bpc) - 1;
+                        const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+                        memcpy(coeff[1], coeff[0], sizeof(*coeff));
+
+                        for (int j = 0; j < w * h; j++)
+                            c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
+                                 HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
+                                 HIGHBD_TAIL_SUFFIX);
+
+                        checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
+                                             a_dst, w * sizeof(*a_dst),
+                                             w, h, "dst");
+                        if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
+                            fail();
+
+                        bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
+                                  HIGHBD_TAIL_SUFFIX);
+                    }
+        }
+        report("add_%dx%d", w, h);
+    }
+}
diff --git a/tests/checkasm/loopfilter.c b/tests/checkasm/loopfilter.c
new file mode 100644
index 0000000..aabf54f
--- /dev/null
+++ b/tests/checkasm/loopfilter.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/loopfilter.h"
+
+static void init_lpf_border(pixel *const dst, const ptrdiff_t stride,
+                            int E, int I, int H, const int bitdepth_max)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int F = 1 << bitdepth_min_8;
+    E <<= bitdepth_min_8;
+    I <<= bitdepth_min_8;
+    H <<= bitdepth_min_8;
+
+    const int filter_type = rnd() % 4;
+    const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2);
+    switch (filter_type) {
+    case 0: // random, unfiltered
+        for (int i = -8; i < 8; i++)
+            dst[i * stride] = rnd() & bitdepth_max;
+        break;
+    case 1: // long flat
+        dst[-8 * stride] = rnd() & bitdepth_max;
+        dst[+7 * stride] = rnd() & bitdepth_max;
+        dst[+0 * stride] = rnd() & bitdepth_max;
+        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
+        for (int i = 1; i < 7; i++) {
+            dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
+                                                 rnd() % (2 * (F + 1)) - (F + 1));
+            dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
+                                                 rnd() % (2 * (F + 1)) - (F + 1));
+        }
+        break;
+    case 2: // short flat
+        for (int i = 4; i < 8; i++) {
+            dst[-(1 + i) * stride] = rnd() & bitdepth_max;
+            dst[+(0 + i) * stride] = rnd() & bitdepth_max;
+        }
+        dst[+0 * stride] = rnd() & bitdepth_max;
+        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
+        for (int i = 1; i < 4; i++) {
+            dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
+                                                 rnd() % (2 * (F + 1)) - (F + 1));
+            dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
+                                                 rnd() % (2 * (F + 1)) - (F + 1));
+        }
+        break;
+    case 3: // normal or hev
+        for (int i = 4; i < 8; i++) {
+            dst[-(1 + i) * stride] = rnd() & bitdepth_max;
+            dst[+(0 + i) * stride] = rnd() & bitdepth_max;
+        }
+        dst[+0 * stride] = rnd() & bitdepth_max;
+        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
+        for (int i = 1; i < 4; i++) {
+            dst[-(1 + i) * stride] = iclip_pixel(dst[-(0 + i) * stride] +
+                                                 rnd() % (2 * (I + 1)) - (I + 1));
+            dst[+(0 + i) * stride] = iclip_pixel(dst[+(i - 1) * stride] +
+                                                 rnd() % (2 * (I + 1)) - (I + 1));
+        }
+        break;
+    }
+}
+
+static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
+                         const int n_blks, const int lf_idx,
+                         const int is_chroma, const int dir)
+{
+    ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
+    ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
+                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                 const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX);
+
+    pixel *a_dst, *c_dst;
+    ptrdiff_t stride, b4_stride;
+    int w, h;
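+    /* dir == 1 tests the vertical filters (edges between rows), dir == 0
+     * the horizontal ones (edges between columns). */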
+    if (dir) {
+        a_dst = a_dst_mem + 128 * 8;
+        c_dst = c_dst_mem + 128 * 8;
+        w = 128;
+        h = 16;
+        b4_stride = 32;
+    } else {
+        a_dst = a_dst_mem + 8;
+        c_dst = c_dst_mem + 8;
+        w = 16;
+        h = 128;
+        b4_stride = 2;
+    }
+    stride = w * sizeof(pixel);
+
+    Av1FilterLUT lut;
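+    /* Derive the E (edge) and I (inner) thresholds for each filter level
+     * from a random sharpness value, as in the AV1 loop filter. */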
+    const int sharp = rnd() & 7;
+    for (int level = 0; level < 64; level++) {
+        int limit = level;
+
+        if (sharp > 0) {
+            limit >>= (sharp + 3) >> 2;
+            limit = imin(limit, 9 - sharp);
+        }
+        limit = imax(limit, 1);
+
+        lut.i[level] = limit;
+        lut.e[level] = 2 * (level + 2) + limit;
+    }
+    lut.sharp[0] = (sharp + 3) >> 2;
+    lut.sharp[1] = sharp ? 9 - sharp : 0xff;
+
+    const int n_strengths = is_chroma ? 2 : 3;
+    for (int i = 0; i < n_strengths; i++) {
+        if (check_func(fn, "%s_w%d_%dbpc", name,
+                       is_chroma ? 4 + 2 * i : 4 << i, BITDEPTH))
+        {
+            uint32_t vmask[4] = { 0 };
+            uint8_t l[32 * 2][4];
+
+            for (int j = 0; j < n_blks; j++) {
+                const int idx = rnd() % (i + 2);
+                if (idx) vmask[idx - 1] |= 1U << j;
+                if (dir) {
+                    l[j][lf_idx] = rnd() & 63;
+                    l[j + 32][lf_idx] = rnd() & 63;
+                } else {
+                    l[j * 2][lf_idx] = rnd() & 63;
+                    l[j * 2 + 1][lf_idx] = rnd() & 63;
+                }
+            }
+#if BITDEPTH == 16
+            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+            const int bitdepth_max = 0xff;
+#endif
+
+            for (int i = 0; i < 4 * n_blks; i++) {
+                const int x = i >> 2;
+                int L;
+                if (dir) {
+                    L = l[32 + x][lf_idx] ? l[32 + x][lf_idx] : l[x][lf_idx];
+                } else {
+                    L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx];
+                }
+                init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 128 : 1,
+                                lut.e[L], lut.i[L], L >> 4, bitdepth_max);
+            }
+            memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16);
+
+            call_ref(c_dst, stride,
+                     vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
+                     &lut, n_blks HIGHBD_TAIL_SUFFIX);
+            call_new(a_dst, stride,
+                     vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
+                     &lut, n_blks HIGHBD_TAIL_SUFFIX);
+
+            checkasm_check_pixel(c_dst_mem, stride, a_dst_mem, stride,
+                                 w, h, "dst");
+            bench_new(a_dst, stride,
+                      vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
+                      &lut, n_blks HIGHBD_TAIL_SUFFIX);
+        }
+    }
+    report(name);
+}
+
+void bitfn(checkasm_check_loopfilter)(void) {
+    Dav1dLoopFilterDSPContext c;
+
+    bitfn(dav1d_loop_filter_dsp_init)(&c);
+
+    check_lpf_sb(c.loop_filter_sb[0][0], "lpf_h_sb_y", 32, 0, 0, 0);
+    check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb_y", 32, 1, 0, 1);
+    check_lpf_sb(c.loop_filter_sb[1][0], "lpf_h_sb_uv", 16, 2, 1, 0);
+    check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb_uv", 16, 2, 1, 1);
+}
diff --git a/tests/checkasm/looprestoration.c b/tests/checkasm/looprestoration.c
new file mode 100644
index 0000000..c76b020
--- /dev/null
+++ b/tests/checkasm/looprestoration.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+static void init_tmp(pixel *buf, const ptrdiff_t stride,
+                     const int w, const int h, const int bitdepth_max)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            buf[x] = rnd() & bitdepth_max;
+        buf += PXSTRIDE(stride);
+    }
+}
+
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
+    pixel left[64][4];
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel (*const left)[4],
+                 const pixel *lpf, ptrdiff_t lpf_stride,
+                 int w, int h, const int16_t filterh[7],
+                 const int16_t filterv[7], enum LrEdgeFlags edges
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int pl = 0; pl < 2; pl++) {
+        if (check_func(c->wiener, "wiener_%s_%dbpc",
+                       pl ? "chroma" : "luma", bpc))
+        {
+            int16_t filter[2][3], filter_v[7], filter_h[7];
+
+            filter[0][0] = pl ? 0 : (rnd() & 15) - 5;
+            filter[0][1] = (rnd() & 31) - 23;
+            filter[0][2] = (rnd() & 63) - 17;
+            filter[1][0] = pl ? 0 : (rnd() & 15) - 5;
+            filter[1][1] = (rnd() & 31) - 23;
+            filter[1][2] = (rnd() & 63) - 17;
+
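+            /* Mirror the three outer taps and choose the center tap so
+             * that each 7-tap filter sums to zero. */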
+            filter_h[0] = filter_h[6] = filter[0][0];
+            filter_h[1] = filter_h[5] = filter[0][1];
+            filter_h[2] = filter_h[4] = filter[0][2];
+            filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
+
+            filter_v[0] = filter_v[6] = filter[1][0];
+            filter_v[1] = filter_v[5] = filter[1][1];
+            filter_v[2] = filter_v[4] = filter[1][2];
+            filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
+
+            const int base_w = 1 + (rnd() % 384);
+            const int base_h = 1 + (rnd() & 63);
+            const int bitdepth_max = (1 << bpc) - 1;
+
+            init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
+            init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
+            init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
+
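+            /* Exercise all 16 combinations of the LR_HAVE_* edge flags. */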
+            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
+                const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
+                const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
+
+                memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
+
+                call_ref(c_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+                                     a_dst + 32, 448 * sizeof(pixel),
+                                     w, h, "dst");
+            }
+            bench_new(a_dst + 32, 448 * sizeof(pixel), left,
+                      h_edge + 32, 448 * sizeof(pixel),
+                      256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
+    pixel left[64][4];
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel (*const left)[4],
+                 const pixel *lpf, ptrdiff_t lpf_stride,
+                 int w, int h, int sgr_idx,
+                 const int16_t sgr_wt[7], enum LrEdgeFlags edges
+                 HIGHBD_DECL_SUFFIX);
+
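+    /* sgr_idx 14, 10 and 6 select the 5x5, 3x3 and mixed self-guided
+     * parameter sets, respectively. */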
+    for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
+        if (check_func(c->selfguided, "selfguided_%s_%dbpc",
+                       sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
+        {
+            int16_t sgr_wt[2];
+
+            sgr_wt[0] = dav1d_sgr_params[sgr_idx][0] ? (rnd() & 127) - 96 : 0;
+            sgr_wt[1] = dav1d_sgr_params[sgr_idx][1] ? (rnd() & 127) - 32 :
+                            iclip(128 - sgr_wt[0], -32, 95);
+
+            const int base_w = 1 + (rnd() % 384);
+            const int base_h = 1 + (rnd() & 63);
+            const int bitdepth_max = (1 << bpc) - 1;
+
+            init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
+            init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
+            init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
+
+            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
+                const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
+                const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
+
+                memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
+
+                call_ref(c_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst + 32, 448 * sizeof(pixel), left,
+                         h_edge + 32, 448 * sizeof(pixel),
+                         w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+                                     a_dst + 32, 448 * sizeof(pixel),
+                                     w, h, "dst");
+            }
+            bench_new(a_dst + 32, 448 * sizeof(pixel), left,
+                      h_edge + 32, 448 * sizeof(pixel),
+                      256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+void bitfn(checkasm_check_looprestoration)(void) {
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_wiener(&c, bpc);
+    }
+    report("wiener");
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_sgr(&c, bpc);
+    }
+    report("sgr");
+}
diff --git a/tests/checkasm/mc.c b/tests/checkasm/mc.c
new file mode 100644
index 0000000..ff8680d
--- /dev/null
+++ b/tests/checkasm/mc.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include "src/levels.h"
+#include "src/mc.h"
+
+static const char *const filter_names[] = {
+    "8tap_regular",        "8tap_regular_smooth", "8tap_regular_sharp",
+    "8tap_sharp_regular",  "8tap_sharp_smooth",   "8tap_sharp",
+    "8tap_smooth_regular", "8tap_smooth",         "8tap_smooth_sharp",
+    "bilinear"
+};
+
+static const char *const mxy_names[] = { "0", "h", "v", "hv" };
+static const char *const scaled_paths[] = { "", "_dy1", "_dy2" };
+
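+/* Step h through both power-of-two and 1.5x sizes:
+ * 2, 4, 6, 8, 12, 16, 24, 32, 64, 128. */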
+static int mc_h_next(const int h) {
+    switch (h) {
+    case 4:
+    case 8:
+    case 16:
+        return (h * 3) >> 1;
+    case 6:
+    case 12:
+    case 24:
+        return (h & (h - 1)) * 2;
+    default:
+        return h * 2;
+    }
+}
+
+static void check_mc(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
+    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
+    const pixel *src = src_buf + 135 * 3 + 3;
+    const ptrdiff_t src_stride = 135 * sizeof(pixel);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
+                 ptrdiff_t src_stride, int w, int h, int mx, int my
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 2; w <= 128; w <<= 1) {
+            const ptrdiff_t dst_stride = w * sizeof(pixel);
+            for (int mxy = 0; mxy < 4; mxy++)
+                if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
+                    filter_names[filter], w, mxy_names[mxy], BITDEPTH))
+                {
+                    const int h_min = w <= 32 ? 2 : w / 4;
+                    const int h_max = imax(imin(w * 4, 128), 32);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
+                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
+#if BITDEPTH == 16
+                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                        const int bitdepth_max = 0xff;
+#endif
+
+                        for (int i = 0; i < 135 * 135; i++)
+                            src_buf[i] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, dst_stride, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, dst_stride, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        checkasm_check_pixel(c_dst, dst_stride,
+                                             a_dst, dst_stride,
+                                             w, h, "dst");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_BILINEAR)
+                        {
+                            bench_new(a_dst, dst_stride, src, src_stride, w, h,
+                                      mx, my HIGHBD_TAIL_SUFFIX);
+                        }
+                    }
+                }
+        }
+    report("mc");
+}
+
+/* Generate worst case input in the topleft corner, randomize the rest */
+static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
+    static const int8_t pattern[8] = { -1,  0, -1,  0,  0, -1,  0, -1 };
+    const int sign = -(rnd() & 1);
+
+    for (int y = 0; y < 135; y++)
+        for (int x = 0; x < 135; x++)
+            buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
+                                        : rnd()) & bitdepth_max;
+}
+
+static void check_mct(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
+    const pixel *src = src_buf + 135 * 3 + 3;
+    const ptrdiff_t src_stride = 135 * sizeof(pixel);
+
+    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+                 int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 4; w <= 128; w <<= 1)
+            for (int mxy = 0; mxy < 4; mxy++)
+                if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
+                    filter_names[filter], w, mxy_names[mxy], BITDEPTH))
+                    for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+                    {
+                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
+                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
+#if BITDEPTH == 16
+                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                        const int bitdepth_max = 0xff;
+#endif
+                        generate_mct_input(src_buf, bitdepth_max);
+
+                        call_ref(c_tmp, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        call_new(a_tmp, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
+                                                a_tmp, w * sizeof(*a_tmp),
+                                                w, h, "tmp");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_BILINEAR)
+                        {
+                            bench_new(a_tmp, src, src_stride, w, h,
+                                      mx, my HIGHBD_TAIL_SUFFIX);
+                        }
+                    }
+    report("mct");
+}
+
+static void check_mc_scaled(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
+    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
+    const pixel *src = src_buf + 263 * 3 + 3;
+    const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+    const int bitdepth_max = 0xff;
+#endif
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
+                 ptrdiff_t src_stride, int w, int h,
+                 int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 2; w <= 128; w <<= 1) {
+            const ptrdiff_t dst_stride = w * sizeof(pixel);
+            for (int p = 0; p < 3; ++p) {
+                if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
+                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
+                {
+                    const int h_min = w <= 32 ? 2 : w / 4;
+                    const int h_max = imax(imin(w * 4, 128), 32);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = rnd() % 1024;
+                        const int my = rnd() % 1024;
+                        const int dx = rnd() % 2048 + 1;
+                        const int dy = !p
+                            ? rnd() % 2048 + 1
+                            : p << 10; // ystep=1.0 and ystep=2.0 paths
+
+                        for (int k = 0; k < 263 * 263; k++)
+                            src_buf[k] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, dst_stride, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, dst_stride, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        checkasm_check_pixel(c_dst, dst_stride,
+                                             a_dst, dst_stride, w, h, "dst");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_BILINEAR)
+                            bench_new(a_dst, dst_stride, src, src_stride,
+                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                    }
+                }
+            }
+        }
+    report("mc_scaled");
+}
+
+static void check_mct_scaled(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+    ALIGN_STK_64(int16_t, c_tmp,   128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp,   128 * 128,);
+    const pixel *src = src_buf + 263 * 3 + 3;
+    const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+    const int bitdepth_max = 0xff;
+#endif
+
+    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+                 int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 4; w <= 128; w <<= 1)
+            for (int p = 0; p < 3; ++p) {
+                if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
+                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
+                {
+                    const int h_min = imax(w / 4, 4);
+                    const int h_max = imin(w * 4, 128);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = rnd() % 1024;
+                        const int my = rnd() % 1024;
+                        const int dx = rnd() % 2048 + 1;
+                        const int dy = !p
+                            ? rnd() % 2048 + 1
+                            : p << 10; // ystep=1.0 and ystep=2.0 paths
+
+                        for (int k = 0; k < 263 * 263; k++)
+                            src_buf[k] = rnd() & bitdepth_max;
+
+                        call_ref(c_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        call_new(a_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
+                                                a_tmp, w * sizeof(*a_tmp),
+                                                w, h, "tmp");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_BILINEAR)
+                            bench_new(a_tmp, src, src_stride,
+                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                    }
+                }
+            }
+    report("mct_scaled");
+}
+
+static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
+                     int16_t (*const tmp)[128 * 128], const int bitdepth_max)
+{
+    for (int i = 0; i < 2; i++) {
+        generate_mct_input(buf, bitdepth_max);
+        c->mct[FILTER_2D_8TAP_SHARP](tmp[i], buf + 135 * 3 + 3,
+                                      135 * sizeof(pixel), 128, 128,
+                                      8, 8 HIGHBD_TAIL_SUFFIX);
+    }
+}
+
+static void check_avg(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
+                 const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
+
+    for (int w = 4; w <= 128; w <<= 1)
+        if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
+            ptrdiff_t dst_stride = w * sizeof(pixel);
+            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+            {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+
+                init_tmp(c, c_dst, tmp, bitdepth_max);
+                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    report("avg");
+}
+
+static void check_w_avg(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
+                 const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
+
+    for (int w = 4; w <= 128; w <<= 1)
+        if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
+            ptrdiff_t dst_stride = w * sizeof(pixel);
+            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+            {
+                int weight = rnd() % 15 + 1;
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                init_tmp(c, c_dst, tmp, bitdepth_max);
+
+                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    report("w_avg");
+}
+
+static void check_mask(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel,   c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel,   a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, mask,  128 * 128,);
+
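+    /* Blend masks are 6-bit weights in [0, 64]. */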
+    for (int i = 0; i < 128 * 128; i++)
+        mask[i] = rnd() % 65;
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
+                 const int16_t *tmp2, int w, int h, const uint8_t *mask
+                 HIGHBD_DECL_SUFFIX);
+
+    for (int w = 4; w <= 128; w <<= 1)
+        if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
+            ptrdiff_t dst_stride = w * sizeof(pixel);
+            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+            {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                init_tmp(c, c_dst, tmp, bitdepth_max);
+                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    report("mask");
+}
+
+static void check_w_mask(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel,   c_dst,  135 * 135,);
+    ALIGN_STK_64(pixel,   a_dst,  128 * 128,);
+    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
+    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
+                 const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
+                 HIGHBD_DECL_SUFFIX);
+
+    static const uint16_t ss[] = { 444, 422, 420 };
+    static const uint8_t ss_hor[] = { 0, 1, 1 };
+    static const uint8_t ss_ver[] = { 0, 0, 1 };
+
+    for (int i = 0; i < 3; i++)
+        for (int w = 4; w <= 128; w <<= 1)
+            if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
+                           BITDEPTH))
+            {
+                ptrdiff_t dst_stride = w * sizeof(pixel);
+                for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+                {
+                    int sign = rnd() & 1;
+#if BITDEPTH == 16
+                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                    const int bitdepth_max = 0xff;
+#endif
+                    init_tmp(c, c_dst, tmp, bitdepth_max);
+
+                    call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
+                             c_mask, sign HIGHBD_TAIL_SUFFIX);
+                    call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
+                             a_mask, sign HIGHBD_TAIL_SUFFIX);
+                    checkasm_check_pixel(c_dst, dst_stride,
+                                         a_dst, dst_stride,
+                                         w, h, "dst");
+                    checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
+                                            a_mask, w >> ss_hor[i],
+                                            w >> ss_hor[i], h >> ss_ver[i],
+                                            "mask");
+
+                    bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
+                              a_mask, sign HIGHBD_TAIL_SUFFIX);
+                }
+            }
+    report("w_mask");
+}
+
+static void check_blend(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, tmp, 32 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(uint8_t, mask, 32 * 32,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
+                 int w, int h, const uint8_t *mask);
+
+    for (int w = 4; w <= 32; w <<= 1) {
+        const ptrdiff_t dst_stride = w * sizeof(pixel);
+        if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
+            for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                for (int i = 0; i < 32 * 32; i++) {
+                    tmp[i] = rnd() & bitdepth_max;
+                    mask[i] = rnd() % 65;
+                }
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
+
+                call_ref(c_dst, dst_stride, tmp, w, h, mask);
+                call_new(a_dst, dst_stride, tmp, w, h, mask);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp, w, h, mask);
+            }
+    }
+    report("blend");
+}
+
+static void check_blend_v(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, tmp,   32 * 128,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 128,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
+                 int w, int h);
+
+    for (int w = 2; w <= 32; w <<= 1) {
+        const ptrdiff_t dst_stride = w * sizeof(pixel);
+        if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
+            for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
+                for (int i = 0; i < 32 * 128; i++)
+                    tmp[i] = rnd() & bitdepth_max;
+
+                call_ref(c_dst, dst_stride, tmp, w, h);
+                call_new(a_dst, dst_stride, tmp, w, h);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp, w, h);
+            }
+    }
+    report("blend_v");
+}
+
+static void check_blend_h(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, tmp,   128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
+                 int w, int h);
+
+    for (int w = 2; w <= 128; w <<= 1) {
+        const ptrdiff_t dst_stride = w * sizeof(pixel);
+        if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
+            for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                const int bitdepth_max = 0xff;
+#endif
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
+                for (int i = 0; i < 128 * 32; i++)
+                    tmp[i] = rnd() & bitdepth_max;
+
+                call_ref(c_dst, dst_stride, tmp, w, h);
+                call_new(a_dst, dst_stride, tmp, w, h);
+                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                                     w, h, "dst");
+
+                bench_new(a_dst, dst_stride, tmp, w, h);
+            }
+    }
+    report("blend_h");
+}
+
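+/* warp8x8 applies an affine warp to an 8x8 block with 8-tap filters, so it
+ * reads up to 3 pixels above/left and 4 below/right of the block: 8 + 7 = 15
+ * rows and columns, which is why src points 3 rows and 3 columns into a
+ * 15x15 buffer. abcd holds the per-step deltas of the affine transform
+ * (roughly alpha/beta/gamma/delta); mx/my are the initial subpel offsets. */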
+static void check_warp8x8(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(pixel, c_dst,    8 *  8,);
+    ALIGN_STK_64(pixel, a_dst,    8 *  8,);
+    int16_t abcd[4];
+    const pixel *src = src_buf + 15 * 3 + 3;
+    const ptrdiff_t dst_stride =  8 * sizeof(pixel);
+    const ptrdiff_t src_stride = 15 * sizeof(pixel);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
+                 ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
+                 HIGHBD_DECL_SUFFIX);
+
+    if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) {
+        const int mx = (rnd() & 0x1fff) - 0xa00;
+        const int my = (rnd() & 0x1fff) - 0xa00;
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+        const int bitdepth_max = 0xff;
+#endif
+
+        for (int i = 0; i < 4; i++)
+            abcd[i] = (rnd() & 0x1fff) - 0xa00;
+
+        for (int i = 0; i < 15 * 15; i++)
+            src_buf[i] = rnd() & bitdepth_max;
+
+        call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                             8, 8, "dst");
+
+        bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+    }
+    report("warp8x8");
+}
+
+static void check_warp8x8t(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(int16_t, c_tmp,  8 *  8,);
+    ALIGN_STK_64(int16_t, a_tmp,  8 *  8,);
+    int16_t abcd[4];
+    const pixel *src = src_buf + 15 * 3 + 3;
+    const ptrdiff_t src_stride = 15 * sizeof(pixel);
+
+    declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src,
+                 ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
+                 HIGHBD_DECL_SUFFIX);
+
+    if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) {
+        const int mx = (rnd() & 0x1fff) - 0xa00;
+        const int my = (rnd() & 0x1fff) - 0xa00;
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+        const int bitdepth_max = 0xff;
+#endif
+
+        for (int i = 0; i < 4; i++)
+            abcd[i] = (rnd() & 0x1fff) - 0xa00;
+
+        for (int i = 0; i < 15 * 15; i++)
+            src_buf[i] = rnd() & bitdepth_max;
+
+        call_ref(c_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        call_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        checkasm_check(int16_t, c_tmp, 8 * sizeof(*c_tmp),
+                                a_tmp, 8 * sizeof(*a_tmp),
+                                8, 8, "tmp");
+
+        bench_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+    }
+    report("warp8x8t");
+}
+
+enum EdgeFlags {
+    HAVE_TOP = 1,
+    HAVE_BOTTOM = 2,
+    HAVE_LEFT = 4,
+    HAVE_RIGHT = 8,
+};
+
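+/* Pick a random block offset (x, y) and image size (iw, ih) so that the
+ * block stays inside the image on every side whose HAVE_* flag is set and
+ * hangs over the border on every side that is not; with neither flag set in
+ * a dimension, the image is made smaller than the block so it overhangs
+ * both sides. This exercises every replication case emu_edge handles. */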
+static void random_offset_for_edge(int *const x, int *const y,
+                                   const int bw, const int bh,
+                                   int *const iw, int *const ih,
+                                   const enum EdgeFlags edge)
+{
+#define set_off(edge1, edge2, pos, dim) \
+    *i##dim = edge & (HAVE_##edge1 | HAVE_##edge2) ? 160 : 1 + (rnd() % (b##dim - 2)); \
+    switch (edge & (HAVE_##edge1 | HAVE_##edge2)) { \
+    case HAVE_##edge1 | HAVE_##edge2: \
+        assert(b##dim <= *i##dim); \
+        *pos = rnd() % (*i##dim - b##dim + 1); \
+        break; \
+    case HAVE_##edge1: \
+        *pos = (*i##dim - b##dim) + 1 + (rnd() % (b##dim - 1)); \
+        break; \
+    case HAVE_##edge2: \
+        *pos = -(1 + (rnd() % (b##dim - 1))); \
+        break; \
+    case 0: \
+        assert(b##dim - 1 > *i##dim); \
+        *pos = -(1 + (rnd() % (b##dim - *i##dim - 1))); \
+        break; \
+    }
+    set_off(LEFT, RIGHT, x, w);
+    set_off(TOP, BOTTOM, y, h);
+}
+
+static void check_emuedge(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, c_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, a_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, src,   160 * 160,);
+
+    for (int i = 0; i < 160 * 160; i++)
+        src[i] = rnd() & ((1U << BITDEPTH) - 1);
+
+    declare_func(void, intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih,
+                 intptr_t x, intptr_t y,
+                 pixel *dst, ptrdiff_t dst_stride,
+                 const pixel *src, ptrdiff_t src_stride);
+
+    int x, y, iw, ih;
+    for (int w = 4; w <= 128; w <<= 1)
+        if (check_func(c->emu_edge, "emu_edge_w%d_%dbpc", w, BITDEPTH)) {
+            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) {
+                // we skip 0xf, since it implies that we don't need emu_edge
+                for (enum EdgeFlags edge = 0; edge < 0xf; edge++) {
+                    const int bw = w + (rnd() & 7);
+                    const int bh = h + (rnd() & 7);
+                    random_offset_for_edge(&x, &y, bw, bh, &iw, &ih, edge);
+                    call_ref(bw, bh, iw, ih, x, y,
+                             c_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
+                    call_new(bw, bh, iw, ih, x, y,
+                             a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
+                    checkasm_check_pixel(c_dst, 192 * sizeof(pixel),
+                                         a_dst, 192 * sizeof(pixel),
+                                         bw, bh, "dst");
+                }
+            }
+            for (enum EdgeFlags edge = 1; edge < 0xf; edge <<= 1) {
+                random_offset_for_edge(&x, &y, w + 7, w + 7, &iw, &ih, edge);
+                bench_new(w + 7, w + 7, iw, ih, x, y,
+                          a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
+            }
+        }
+    report("emu_edge");
+}
+
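+/* Mirrors the AV1 super-resolution setup: `step` is the 14-bit fixed-point
+ * source increment per output pixel, `err` the rounding error accumulated
+ * across the row, and x0 the initial subpel offset that keeps the upscaled
+ * row centred over the source. Only the fractional 14 bits are kept. */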
+static int get_upscale_x0(const int in_w, const int out_w, const int step) {
+    const int err = out_w * step - (in_w << 14);
+    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
+    return x0 & 0x3fff;
+}
+
+static void check_resize(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
+    ALIGN_STK_64(pixel, src,   512 * 64,);
+
+    const int height = 64;
+    const int max_src_width = 512;
+    const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
+    const ptrdiff_t src_stride = 512 * sizeof(pixel);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel *src, ptrdiff_t src_stride,
+                 int dst_w, int h, int src_w, int dx, int mx0
+                 HIGHBD_DECL_SUFFIX);
+
+    if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) {
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+        const int bitdepth_max = 0xff;
+#endif
+
+        for (int i = 0; i < max_src_width * height; i++)
+            src[i] = rnd() & bitdepth_max;
+
+        const int w_den = 9 + (rnd() & 7);
+        const int src_w = 16 + (rnd() % (max_src_width - 16 + 1));
+        const int dst_w = w_den * src_w >> 3;
+#define scale_fac(ref_sz, this_sz) \
+    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
+        const int dx = scale_fac(src_w, dst_w);
+#undef scale_fac
+        const int mx0 = get_upscale_x0(src_w, dst_w, dx);
+
+        call_ref(c_dst, dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        call_new(a_dst, dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                             dst_w, height, "dst");
+
+        bench_new(a_dst, dst_stride, src, src_stride,
+                  512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
+    }
+
+    report("resize");
+}
+
+void bitfn(checkasm_check_mc)(void) {
+    Dav1dMCDSPContext c;
+    bitfn(dav1d_mc_dsp_init)(&c);
+
+    check_mc(&c);
+    check_mct(&c);
+    check_mc_scaled(&c);
+    check_mct_scaled(&c);
+    check_avg(&c);
+    check_w_avg(&c);
+    check_mask(&c);
+    check_w_mask(&c);
+    check_blend(&c);
+    check_blend_v(&c);
+    check_blend_h(&c);
+    check_warp8x8(&c);
+    check_warp8x8t(&c);
+    check_emuedge(&c);
+    check_resize(&c);
+}
diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c
new file mode 100644 (file)
index 0000000..cdaf0de
--- /dev/null
@@ -0,0 +1,277 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include "src/cpu.h"
+#include "src/msac.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#define BUF_SIZE 8192
+
+/* The normal code doesn't use function pointers */
+typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
+                                           size_t n_symbols);
+typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
+typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
+typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
+
+typedef struct {
+    decode_symbol_adapt_fn symbol_adapt4;
+    decode_symbol_adapt_fn symbol_adapt8;
+    decode_symbol_adapt_fn symbol_adapt16;
+    decode_adapt_fn        bool_adapt;
+    decode_bool_equi_fn    bool_equi;
+    decode_bool_fn         bool;
+    decode_adapt_fn        hi_tok;
+} MsacDSPContext;
+
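+/* Build a random but well-formed CDF: entries decrease strictly from cdf[0]
+ * down to cdf[n - 1] while staying below 2^15, cdf[n] is the adaptation
+ * counter (initially 0), and the slots above it are filled with random
+ * padding. */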
+static void randomize_cdf(uint16_t *const cdf, const int n) {
+    int i;
+    for (i = 15; i > n; i--)
+        cdf[i] = rnd(); // padding
+    cdf[i] = 0;         // count
+    do {
+        cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
+    } while (--i > 0);
+}
+
+/* memcmp() on structs can have weird behavior due to padding etc. */
+static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
+    return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
+           a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
+           a->allow_update_cdf != b->allow_update_cdf;
+}
+
+static void msac_dump(unsigned c_res, unsigned a_res,
+                      const MsacContext *const a, const MsacContext *const b,
+                      const uint16_t *const cdf_a, const uint16_t *const cdf_b,
+                      const int num_cdf)
+{
+    if (c_res != a_res)
+        fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
+    if (a->buf_pos != b->buf_pos)
+        fprintf(stderr, "buf_pos %p vs %p\n", a->buf_pos, b->buf_pos);
+    if (a->buf_end != b->buf_end)
+        fprintf(stderr, "buf_end %p vs %p\n", a->buf_end, b->buf_end);
+    if (a->dif != b->dif)
+        fprintf(stderr, "dif %zx vs %zx\n", a->dif, b->dif);
+    if (a->rng != b->rng)
+        fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
+    if (a->cnt != b->cnt)
+        fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
+    if (a->allow_update_cdf != b->allow_update_cdf)
+        fprintf(stderr, "allow_update_cdf %d vs %d\n",
+                a->allow_update_cdf, b->allow_update_cdf);
+    if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
+        fprintf(stderr, "cdf:\n");
+        for (int i = 0; i <= num_cdf; i++)
+            fprintf(stderr, " %5u", cdf_a[i]);
+        fprintf(stderr, "\n");
+        for (int i = 0; i <= num_cdf; i++)
+            fprintf(stderr, " %5u", cdf_b[i]);
+        fprintf(stderr, "\n");
+        for (int i = 0; i <= num_cdf; i++)
+            fprintf(stderr, "     %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
+        fprintf(stderr, "\n");
+    }
+}
+
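+/* Run 64 back-to-back decodes with CDF adaptation off and on, comparing the
+ * returned symbol, the complete coder state and the updated CDF after every
+ * call; the coder is stateful, so one mismatch makes everything after it
+ * diverge. Each variant is benchmarked once (ns == n - 1) with adaptation
+ * enabled. */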
+#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do {                           \
+    if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
+        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {          \
+            for (int ns = n_min; ns <= n_max; ns++) {                      \
+                dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);         \
+                s_a = s_c;                                                 \
+                randomize_cdf(cdf[0], ns);                                 \
+                memcpy(cdf[1], cdf[0], sizeof(*cdf));                      \
+                for (int i = 0; i < 64; i++) {                             \
+                    unsigned c_res = call_ref(&s_c, cdf[0], ns);           \
+                    unsigned a_res = call_new(&s_a, cdf[1], ns);           \
+                    if (c_res != a_res || msac_cmp(&s_c, &s_a) ||          \
+                        memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1)))  \
+                    {                                                      \
+                        if (fail())                                        \
+                            msac_dump(c_res, a_res, &s_c, &s_a,            \
+                                      cdf[0], cdf[1], ns);                 \
+                    }                                                      \
+                }                                                          \
+                if (cdf_update && ns == n - 1)                             \
+                    bench_new(&s_a, cdf[1], ns);                           \
+            }                                                              \
+        }                                                                  \
+    }                                                                      \
+} while (0)
+
+static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
+    ALIGN_STK_32(uint16_t, cdf, 2, [16]);
+    MsacContext s_c, s_a;
+
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
+    CHECK_SYMBOL_ADAPT( 4, 1,  4);
+    CHECK_SYMBOL_ADAPT( 8, 1,  7);
+    CHECK_SYMBOL_ADAPT(16, 3, 15);
+    report("decode_symbol");
+}
+
+static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+    MsacContext s_c, s_a;
+
+    if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) {
+        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
+        uint16_t cdf[2][2];
+        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
+            dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
+            s_a = s_c;
+            cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1;
+            cdf[0][1] = cdf[1][1] = 0;
+            for (int i = 0; i < 64; i++) {
+                unsigned c_res = call_ref(&s_c, cdf[0]);
+                unsigned a_res = call_new(&s_a, cdf[1]);
+                if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
+                    memcmp(cdf[0], cdf[1], sizeof(*cdf)))
+                {
+                    if (fail())
+                        msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
+                }
+            }
+            if (cdf_update)
+                bench_new(&s_a, cdf[1]);
+        }
+    }
+
+    if (check_func(c->bool_equi, "msac_decode_bool_equi")) {
+        declare_func(unsigned, MsacContext *s);
+        dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
+        s_a = s_c;
+        for (int i = 0; i < 64; i++) {
+            unsigned c_res = call_ref(&s_c);
+            unsigned a_res = call_new(&s_a);
+            if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
+                if (fail())
+                    msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
+            }
+        }
+        bench_new(&s_a);
+    }
+
+    if (check_func(c->bool, "msac_decode_bool")) {
+        declare_func(unsigned, MsacContext *s, unsigned f);
+        dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
+        s_a = s_c;
+        for (int i = 0; i < 64; i++) {
+            const unsigned f = rnd() & 0x7fff;
+            unsigned c_res = call_ref(&s_c, f);
+            unsigned a_res = call_new(&s_a, f);
+            if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
+                if (fail())
+                    msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
+            }
+        }
+        bench_new(&s_a, 16384);
+    }
+
+    report("decode_bool");
+}
+
+static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
+    ALIGN_STK_16(uint16_t, cdf, 2, [16]);
+    MsacContext s_c, s_a;
+
+    if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
+        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
+        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
+            dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
+            s_a = s_c;
+            randomize_cdf(cdf[0], 3);
+            memcpy(cdf[1], cdf[0], sizeof(*cdf));
+            for (int i = 0; i < 64; i++) {
+                unsigned c_res = call_ref(&s_c, cdf[0]);
+                unsigned a_res = call_new(&s_a, cdf[1]);
+                if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
+                    memcmp(cdf[0], cdf[1], sizeof(*cdf)))
+                {
+                    if (fail())
+                        msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
+                    break;
+                }
+            }
+            if (cdf_update)
+                bench_new(&s_a, cdf[1]);
+        }
+    }
+    report("decode_hi_tok");
+}
+
+void checkasm_check_msac(void) {
+    MsacDSPContext c;
+    c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt_c;
+    c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt_c;
+    c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
+    c.bool_adapt     = dav1d_msac_decode_bool_adapt_c;
+    c.bool_equi      = dav1d_msac_decode_bool_equi_c;
+    c.bool           = dav1d_msac_decode_bool_c;
+    c.hi_tok         = dav1d_msac_decode_hi_tok_c;
+
+#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
+    if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
+        c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_neon;
+        c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_neon;
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
+        c.bool_adapt     = dav1d_msac_decode_bool_adapt_neon;
+        c.bool_equi      = dav1d_msac_decode_bool_equi_neon;
+        c.bool           = dav1d_msac_decode_bool_neon;
+        c.hi_tok         = dav1d_msac_decode_hi_tok_neon;
+    }
+#elif ARCH_X86 && HAVE_ASM
+    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
+        c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_sse2;
+        c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_sse2;
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+        c.bool_adapt     = dav1d_msac_decode_bool_adapt_sse2;
+        c.bool_equi      = dav1d_msac_decode_bool_equi_sse2;
+        c.bool           = dav1d_msac_decode_bool_sse2;
+        c.hi_tok         = dav1d_msac_decode_hi_tok_sse2;
+    }
+
+#if ARCH_X86_64
+    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+    }
+#endif
+#endif
+
+    uint8_t buf[BUF_SIZE];
+    for (int i = 0; i < BUF_SIZE; i++)
+        buf[i] = rnd();
+
+    check_decode_symbol(&c, buf);
+    check_decode_bool(&c, buf);
+    check_decode_hi_tok(&c, buf);
+}
diff --git a/tests/checkasm/x86/checkasm.asm b/tests/checkasm/x86/checkasm.asm
new file mode 100644 (file)
index 0000000..bc7ec22
--- /dev/null
@@ -0,0 +1,286 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%define private_prefix checkasm
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_64
+; just random numbers to reduce the chance of incidental match
+%if WIN64
+x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
+x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
+x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
+x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
+x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
+x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
+x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
+x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
+x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
+x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
+n7:  dq 0x21f86d66c8ca00ce
+n8:  dq 0x75b6ba21077c48ad
+%endif
+n9:  dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
+%endif
+
+errmsg_reg:   db "failed to preserve register", 0
+errmsg_stack: db "stack corruption", 0
+
+SECTION .text
+
+cextern fail_func
+
+; max number of args used by any asm function.
+; (max_args % 4) must equal 3 for stack alignment
+%define max_args 15
+
+%if ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; int checkasm_stack_clobber(uint64_t clobber, ...)
+;-----------------------------------------------------------------------------
+cglobal stack_clobber, 1, 2
+    ; Clobber the stack with junk below the stack pointer
+    %define argsize (max_args+6)*8
+    SUB  rsp, argsize
+    mov   r1, argsize-8
+.loop:
+    mov [rsp+r1], r0
+    sub   r1, 8
+    jge .loop
+    ADD  rsp, argsize
+    RET
+
+%if WIN64
+    %assign free_regs 7
+    %define stack_param rsp+32 ; shadow space
+    %define num_stack_params rsp+stack_offset+22*8
+    DECLARE_REG_TMP 4
+%else
+    %assign free_regs 9
+    %define stack_param rsp
+    %define num_stack_params rsp+stack_offset+16*8
+    DECLARE_REG_TMP 7
+%endif
+
+;-----------------------------------------------------------------------------
+; void checkasm_checked_call(void *func, ...)
+;-----------------------------------------------------------------------------
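+; Seeds all callee-saved GPRs (and XMM6-15 on WIN64) with random constants
+; before the call and XORs them against the same constants afterwards; any
+; nonzero result means the checked function clobbered a register it should
+; have preserved. Canaries derived from the inverted return address are also
+; written above the copied stack parameters to detect out-of-bounds writes.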
+INIT_XMM
+cglobal checked_call, 2, 15, 16, max_args*8+64+8
+    mov  t0, r0
+
+    ; All arguments have been pushed on the stack instead of registers in
+    ; order to test for incorrect assumptions that 32-bit ints are
+    ; zero-extended to 64-bit.
+    mov  r0, r6mp
+    mov  r1, r7mp
+    mov  r2, r8mp
+    mov  r3, r9mp
+%if UNIX64
+    mov  r4, r10mp
+    mov  r5, r11mp
+%else ; WIN64
+    ; Move possible floating-point arguments to the correct registers
+    movq m0, r0
+    movq m1, r1
+    movq m2, r2
+    movq m3, r3
+
+    %assign i 6
+    %rep 16-6
+        mova m %+ i, [x %+ i]
+        %assign i i+1
+    %endrep
+%endif
+
+    ; write stack canaries to the area above parameters passed on the stack
+    mov r9d, [num_stack_params]
+    mov  r8, [rsp+stack_offset] ; return address
+    not  r8
+%assign i 0
+%rep 8 ; 64 bytes
+    mov [stack_param+(r9+i)*8], r8
+    %assign i i+1
+%endrep
+    dec r9d
+    jl .stack_setup_done ; no stack parameters
+.copy_stack_parameter:
+    mov  r8, [stack_param+stack_offset+7*8+r9*8]
+    mov [stack_param+r9*8], r8
+    dec r9d
+    jge .copy_stack_parameter
+.stack_setup_done:
+
+%assign i 14
+%rep 15-free_regs
+    mov r %+ i, [n %+ i]
+    %assign i i-1
+%endrep
+    call t0
+
+    ; check for failure to preserve registers
+    xor r14, [n14]
+    lea  r0, [errmsg_reg]
+%assign i 13
+%rep 14-free_regs
+    xor r %+ i, [n %+ i]
+    or  r14, r %+ i
+    %assign i i-1
+%endrep
+%if WIN64
+    pxor m6, [x6]
+    %assign i 7
+    %rep 16-7
+        pxor m %+ i, [x %+ i]
+        por  m6, m %+ i
+        %assign i i+1
+    %endrep
+    packsswb m6, m6
+    movq r5, m6
+    or  r14, r5
+%endif
+    jnz .fail
+
+    ; check for stack corruption
+    mov r9d, [num_stack_params]
+    mov  r8, [rsp+stack_offset]
+    mov  r4, [stack_param+r9*8]
+    not  r8
+    xor  r4, r8
+%assign i 1
+%rep 6
+    mov  r5, [stack_param+(r9+i)*8]
+    xor  r5, r8
+    or   r4, r5
+    %assign i i+1
+%endrep
+    xor  r8, [stack_param+(r9+7)*8]
+    or   r4, r8
+    jz .ok
+    add  r0, errmsg_stack-errmsg_reg
+.fail:
+    ; Call fail_func() with a descriptive message to mark it as a failure.
+    ; Save the return value located in rdx:rax first to prevent clobbering.
+    mov  r9, rax
+    mov r10, rdx
+    xor eax, eax
+    call fail_func
+    mov rdx, r10
+    mov rax, r9
+.ok:
+    RET
+
+; trigger a warmup of vector units
+%macro WARMUP 0
+cglobal warmup, 0, 0
+    xorps   m0, m0
+    mulps   m0, m0
+    RET
+%endmacro
+
+INIT_YMM avx2
+WARMUP
+INIT_ZMM avx512
+WARMUP
+
+%else
+
+; just random numbers to reduce the chance of incidental match
+%assign n3 0x6549315c
+%assign n4 0xe02f3e23
+%assign n5 0xb78d0d1d
+%assign n6 0x33627ba7
+
+;-----------------------------------------------------------------------------
+; void checkasm_checked_call(void *func, ...)
+;-----------------------------------------------------------------------------
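+; 32-bit variant of the same idea: canaries and copied stack parameters are
+; pushed below the caller's frame, and the callee-saved GPRs are seeded with
+; the n3..n6 constants and verified after the call.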
+cglobal checked_call, 1, 7
+    mov  r3, [esp+stack_offset]      ; return address
+    mov  r1, [esp+stack_offset+17*4] ; num_stack_params
+    mov  r2, 27
+    not  r3
+    sub  r2, r1
+.push_canary:
+    push r3
+    dec  r2
+    jg .push_canary
+.push_parameter:
+    push dword [esp+32*4]
+    dec  r1
+    jg .push_parameter
+    mov  r3, n3
+    mov  r4, n4
+    mov  r5, n5
+    mov  r6, n6
+    call r0
+
+    ; check for failure to preserve registers
+    xor  r3, n3
+    xor  r4, n4
+    xor  r5, n5
+    xor  r6, n6
+    or   r3, r4
+    or   r5, r6
+    LEA  r1, errmsg_reg
+    or   r3, r5
+    jnz .fail
+
+    ; check for stack corruption
+    mov  r3, [esp+48*4] ; num_stack_params
+    mov  r6, [esp+31*4] ; return address
+    mov  r4, [esp+r3*4]
+    sub  r3, 26
+    not  r6
+    xor  r4, r6
+.check_canary:
+    mov  r5, [esp+(r3+27)*4]
+    xor  r5, r6
+    or   r4, r5
+    inc  r3
+    jl .check_canary
+    test r4, r4
+    jz .ok
+    add  r1, errmsg_stack-errmsg_reg
+.fail:
+    mov  r3, eax
+    mov  r4, edx
+    mov [esp], r1
+    call fail_func
+    mov edx, r4
+    mov eax, r3
+.ok:
+    add esp, 27*4
+    RET
+
+%endif ; ARCH_X86_64
diff --git a/tests/libfuzzer/alloc_fail.c b/tests/libfuzzer/alloc_fail.c
new file mode 100644 (file)
index 0000000..ddd1dd7
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include "alloc_fail.h"
+
+static int fail_probability;
+
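+/* The __wrap_* functions below stand in for the real allocation and pthread
+ * init functions: the build renames those symbols in a static libdav1d with
+ * objcopy (see tests/libfuzzer/meson.build), so every allocation the library
+ * makes can be failed with the configured pseudo-random probability. */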
+void dav1d_setup_alloc_fail(unsigned seed, unsigned probability) {
+    srand(seed);
+
+    while (probability >= RAND_MAX)
+        probability >>= 1;
+
+    fail_probability = probability;
+}
+
+void * __wrap_malloc(size_t);
+
+void * __wrap_malloc(size_t sz) {
+    if (rand() < fail_probability)
+        return NULL;
+    return malloc(sz);
+}
+
+#if defined(HAVE_POSIX_MEMALIGN)
+int __wrap_posix_memalign(void **memptr, size_t alignment, size_t size);
+
+int __wrap_posix_memalign(void **memptr, size_t alignment, size_t size) {
+    if (rand() < fail_probability)
+        return ENOMEM;
+    return posix_memalign(memptr, alignment, size);
+}
+#else
+#error "HAVE_POSIX_MEMALIGN required"
+#endif
+
+int __wrap_pthread_create(pthread_t *, const pthread_attr_t *,
+                          void *(*) (void *), void *);
+
+int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
+                          void *(*start_routine) (void *), void *arg)
+{
+    if (rand() < (fail_probability + RAND_MAX/16))
+        return EAGAIN;
+
+    return pthread_create(thread, attr, start_routine, arg);
+}
+
+int __wrap_pthread_mutex_init(pthread_mutex_t *,
+                              const pthread_mutexattr_t *);
+
+int __wrap_pthread_mutex_init(pthread_mutex_t *restrict mutex,
+                              const pthread_mutexattr_t *restrict attr)
+{
+    if (rand() < (fail_probability + RAND_MAX/8))
+        return ENOMEM;
+
+    return pthread_mutex_init(mutex, attr);
+}
+
+int __wrap_pthread_cond_init(pthread_cond_t *,
+                             const pthread_condattr_t *);
+
+int __wrap_pthread_cond_init(pthread_cond_t *restrict cond,
+                             const pthread_condattr_t *restrict attr)
+{
+    if (rand() < (fail_probability + RAND_MAX/16))
+        return ENOMEM;
+
+    return pthread_cond_init(cond, attr);
+}
diff --git a/tests/libfuzzer/alloc_fail.h b/tests/libfuzzer/alloc_fail.h
new file mode 100644 (file)
index 0000000..5ace870
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_TESTS_LIBFUZZER_ALLOC_FAIL_H
+#define DAV1D_TESTS_LIBFUZZER_ALLOC_FAIL_H
+
+#include <dav1d/common.h>
+
+DAV1D_API void dav1d_setup_alloc_fail(unsigned seed, unsigned probability);
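+
+/* Usage sketch (values illustrative): seed the failure RNG once before
+ * dav1d_open(), e.g. dav1d_setup_alloc_fail(seed, RAND_MAX >> 5), after
+ * which roughly that fraction of allocations inside libdav1d will fail. */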
+
+#endif /* DAV1D_TESTS_LIBFUZZER_ALLOC_FAIL_H */
diff --git a/tests/libfuzzer/dav1d_fuzzer.c b/tests/libfuzzer/dav1d_fuzzer.c
new file mode 100644 (file)
index 0000000..4506d2f
--- /dev/null
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <dav1d/dav1d.h>
+#include "src/cpu.h"
+#include "dav1d_fuzzer.h"
+
+#ifdef DAV1D_ALLOC_FAIL
+
+#include <stdlib.h>
+
+#include "alloc_fail.h"
+
+static unsigned djb_xor(const uint8_t * c, size_t len) {
+    unsigned hash = 5381;
+    for(size_t i = 0; i < len; i++)
+        hash = hash * 33 ^ c[i];
+    return hash;
+}
+#endif
+
+static unsigned r32le(const uint8_t *const p) {
+    return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0];
+}
+
+#define DAV1D_FUZZ_MAX_SIZE (4096 * 4096)
+
+// expects ivf input
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
+{
+    Dav1dSettings settings = { 0 };
+    Dav1dContext * ctx = NULL;
+    Dav1dPicture pic;
+    const uint8_t *ptr = data;
+    int have_seq_hdr = 0;
+    int err;
+
+    dav1d_version();
+
+    if (size < 32) goto end;
+#ifdef DAV1D_ALLOC_FAIL
+    unsigned h = djb_xor(ptr, 32);
+    unsigned seed = h;
+    unsigned probability = h > (RAND_MAX >> 5) ? RAND_MAX >> 5 : h;
+    int n_frame_threads = (h & 0xf) + 1;
+    int n_tile_threads = ((h >> 4) & 0x7) + 1;
+    if (n_frame_threads > 5) n_frame_threads = 1;
+    if (n_tile_threads > 3) n_tile_threads = 1;
+#endif
+    ptr += 32; // skip ivf header
+
+    dav1d_default_settings(&settings);
+
+#ifdef DAV1D_MT_FUZZING
+    settings.n_frame_threads = settings.n_tile_threads = 2;
+#elif defined(DAV1D_ALLOC_FAIL)
+    settings.n_frame_threads = n_frame_threads;
+    settings.n_tile_threads = n_tile_threads;
+    dav1d_setup_alloc_fail(seed, probability);
+#else
+    settings.n_frame_threads = settings.n_tile_threads = 1;
+#endif
+#if defined(DAV1D_FUZZ_MAX_SIZE)
+    settings.frame_size_limit = DAV1D_FUZZ_MAX_SIZE;
+#endif
+
+    err = dav1d_open(&ctx, &settings);
+    if (err < 0) goto end;
+
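+    // IVF framing: a 32-byte file header (skipped above), then for each
+    // frame a 12-byte header whose first 4 bytes are the little-endian
+    // frame size; the remaining 8 bytes (timestamp) are ignored.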
+    while (ptr <= data + size - 12) {
+        Dav1dData buf;
+        uint8_t *p;
+
+        size_t frame_size = r32le(ptr);
+        ptr += 12;
+
+        if (frame_size > size || ptr > data + size - frame_size)
+            break;
+
+        if (!frame_size) continue;
+
+        if (!have_seq_hdr) {
+            Dav1dSequenceHeader seq = { 0 };
+            int err = dav1d_parse_sequence_header(&seq, ptr, frame_size);
+            // skip frames until we see a sequence header
+            if (err != 0) {
+                ptr += frame_size;
+                continue;
+            }
+            have_seq_hdr = 1;
+        }
+
+        // copy frame data to a new buffer to catch reads past the end of input
+        p = dav1d_data_create(&buf, frame_size);
+        if (!p) goto cleanup;
+        memcpy(p, ptr, frame_size);
+        ptr += frame_size;
+
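+        // Standard dav1d pump: alternate dav1d_send_data() and
+        // dav1d_get_picture() until the buffer is consumed; either call
+        // returns DAV1D_ERR(EAGAIN) when the other side needs servicing.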
+        do {
+            if ((err = dav1d_send_data(ctx, &buf)) < 0) {
+                if (err != DAV1D_ERR(EAGAIN))
+                    break;
+            }
+            memset(&pic, 0, sizeof(pic));
+            err = dav1d_get_picture(ctx, &pic);
+            if (err == 0) {
+                dav1d_picture_unref(&pic);
+            } else if (err != DAV1D_ERR(EAGAIN)) {
+                break;
+            }
+        } while (buf.sz > 0);
+
+        if (buf.sz > 0)
+            dav1d_data_unref(&buf);
+    }
+
+    do {
+        memset(&pic, 0, sizeof(pic));
+        err = dav1d_get_picture(ctx, &pic);
+        if (err == 0)
+            dav1d_picture_unref(&pic);
+    } while (err != DAV1D_ERR(EAGAIN));
+
+cleanup:
+    dav1d_flush(ctx);
+    dav1d_close(&ctx);
+end:
+    return 0;
+}
diff --git a/tests/libfuzzer/dav1d_fuzzer.h b/tests/libfuzzer/dav1d_fuzzer.h
new file mode 100644 (file)
index 0000000..5d93299
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
+#define DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
+
+#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
diff --git a/tests/libfuzzer/main.c b/tests/libfuzzer/main.c
new file mode 100644 (file)
index 0000000..985ebba
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "dav1d_fuzzer.h"
+
+// expects ivf input
+
+int main(const int argc, char *const *const argv) {
+    int ret = -1;
+    FILE *f = NULL;
+    int64_t fsize;
+    const char *filename = NULL;
+    uint8_t *data = NULL;
+    size_t size = 0;
+
+    if (argc != 2) {
+        fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
+        return -1;
+    }
+    filename = argv[1];
+
+    if (!(f = fopen(filename, "rb"))) {
+        fprintf(stderr, "failed to open %s: %s\n", filename, strerror(errno));
+        goto error;
+    }
+
+    if (fseeko(f, 0, SEEK_END) == -1) {
+        fprintf(stderr, "fseek(%s, 0, SEEK_END) failed: %s\n", filename,
+                strerror(errno));
+        goto error;
+    }
+    if ((fsize = ftello(f)) == -1) {
+        fprintf(stderr, "ftell(%s) failed: %s\n", filename, strerror(errno));
+        goto error;
+    }
+    rewind(f);
+
+    if (fsize < 0 || fsize > INT_MAX) {
+        fprintf(stderr, "%s is too large: %"PRId64"\n", filename, fsize);
+        goto error;
+    }
+    size = (size_t)fsize;
+
+    if (!(data = malloc(size))) {
+        fprintf(stderr, "failed to allocate: %zu bytes\n", size);
+        goto error;
+    }
+
+    if (fread(data, size, 1, f) != 1) {
+        fprintf(stderr, "failed to read %zu bytes from %s: %s\n", size,
+                filename, strerror(errno));
+        goto error;
+    }
+
+    ret = LLVMFuzzerTestOneInput(data, size);
+
+error:
+    free(data);
+    if (f) fclose(f);
+    return ret;
+}
diff --git a/tests/libfuzzer/meson.build b/tests/libfuzzer/meson.build
new file mode 100644 (file)
index 0000000..3591403
--- /dev/null
@@ -0,0 +1,102 @@
+# Copyright © 2020, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d fuzzing binaries
+#
+
+if fuzzing_engine == 'none' and not have_fseeko
+    subdir_done()
+endif
+
+dav1d_fuzzer_sources =  files('dav1d_fuzzer.c')
+fuzzer_ldflags = []
+fuzzer_link_lang = {}
+
+if get_option('fuzzer_ldflags') != ''
+    fuzzer_ldflags += [get_option('fuzzer_ldflags')]
+endif
+
+if fuzzing_engine == 'none'
+    dav1d_fuzzer_sources += files('main.c')
+elif fuzzing_engine == 'libfuzzer'
+    fuzzer_ldflags += ['-fsanitize=fuzzer']
+elif fuzzing_engine == 'oss-fuzz'
+    # libFuzzingEngine needs c++
+    add_languages('cpp')
+    fuzzer_link_lang = {'link_language': 'cpp'}
+endif
+
+dav1d_fuzzer = executable('dav1d_fuzzer',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
+dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
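+# The allocation-failure fuzzer intercepts malloc and friends by renaming
+# their symbols in a static libdav1d with objcopy so that they resolve to
+# the __wrap_* versions in alloc_fail.c. That trick requires a static
+# library and no LTO, hence the guards below.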
+objcopy = find_program('objcopy',
+                       required: false)
+if (objcopy.found() and
+    not get_option('b_lto') and
+    get_option('default_library') == 'static' and
+    cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
+
+    libdav1d_af = custom_target('libdav1d_af',
+                                input: libdav1d,
+                                output: 'libdav1d_af.a',
+                                depends: libdav1d,
+                                command: [objcopy,
+                                          '--redefine-sym', 'malloc=__wrap_malloc',
+                                          '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
+                                          '--redefine-sym', 'pthread_create=__wrap_pthread_create',
+                                          '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
+                                          '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
+                                          '@INPUT@', '@OUTPUT@'])
+
+    dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
+        dav1d_fuzzer_sources + ['alloc_fail.c'],
+        include_directories: dav1d_inc_dirs,
+        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
+        link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
+        link_depends: libdav1d_af,
+        build_by_default: false,
+        dependencies : [thread_dependency],
+        kwargs: fuzzer_link_lang
+        )
+endif
diff --git a/tests/meson.build b/tests/meson.build
new file mode 100644 (file)
index 0000000..5a41f2f
--- /dev/null
@@ -0,0 +1,112 @@
+# Copyright © 2018, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d tests
+#
+
+# Leave subdir if tests are disabled
+if not get_option('enable_tests')
+    subdir_done()
+endif
+
+libdav1d_nasm_objs_if_needed = []
+
+if is_asm_enabled
+    checkasm_sources = files(
+        'checkasm/checkasm.c',
+        'checkasm/msac.c',
+    )
+
+    checkasm_tmpl_sources = files(
+        'checkasm/cdef.c',
+        'checkasm/filmgrain.c',
+        'checkasm/ipred.c',
+        'checkasm/itx.c',
+        'checkasm/loopfilter.c',
+        'checkasm/looprestoration.c',
+        'checkasm/mc.c',
+    )
+
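+    # The template tests are compiled once per enabled bit depth with a
+    # different -DBITDEPTH so that a single checkasm binary covers both the
+    # 8 and 16 bpc variants of each checked function.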
+    checkasm_bitdepth_objs = []
+    foreach bitdepth : dav1d_bitdepths
+        checkasm_bitdepth_lib = static_library(
+            'checkasm_bitdepth_@0@'.format(bitdepth),
+            checkasm_tmpl_sources,
+            include_directories: dav1d_inc_dirs,
+            c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag],
+            install: false,
+            build_by_default: false,
+        )
+        checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
+    endforeach
+
+    checkasm_nasm_objs = []
+    if host_machine.cpu_family() == 'aarch64'
+        checkasm_sources += files('checkasm/arm/checkasm_64.S')
+    elif host_machine.cpu_family().startswith('arm')
+        checkasm_sources += files('checkasm/arm/checkasm_32.S')
+    elif host_machine.cpu_family().startswith('x86')
+        checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
+    endif
+
+    m_lib = cc.find_library('m', required: false)
+
+    if meson.version().version_compare('< 0.48.999')
+        libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
+    endif
+
+    checkasm = executable('checkasm',
+        checkasm_sources,
+        checkasm_nasm_objs,
+        libdav1d_nasm_objs_if_needed,
+
+        objects: [
+            checkasm_bitdepth_objs,
+            libdav1d.extract_all_objects(recursive: true),
+            ],
+
+        include_directories: dav1d_inc_dirs,
+        c_args: [stackalign_flag, stackrealign_flag],
+        build_by_default: false,
+        dependencies : [
+            thread_dependency,
+            rt_dependency,
+            libdl_dependency,
+            m_lib,
+            ],
+        )
+
+    test('checkasm', checkasm, is_parallel: false)
+endif
+
+# fuzzing binaries
+if meson.version().version_compare('>=0.49')
+    subdir('libfuzzer')
+endif
+
+# Include dav1d test data repository with additional tests
+if get_option('testdata_tests')
+    subdir('dav1d-test-data')
+endif
diff --git a/tools/compat/getopt.c b/tools/compat/getopt.c
new file mode 100644 (file)
index 0000000..ac1fda4
--- /dev/null
@@ -0,0 +1,562 @@
+/*     $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $    */
+/*     $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $      */
+
+/*
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <windows.h>
+
+#define        REPLACE_GETOPT          /* use this getopt as the system getopt(3) */
+
+#ifdef REPLACE_GETOPT
+int    opterr = 1;             /* if error message should be printed */
+int    optind = 1;             /* index into parent argv vector */
+int    optopt = '?';           /* character checked for validity */
+#undef optreset                /* see getopt.h */
+#define        optreset                __mingw_optreset
+int    optreset;               /* reset getopt */
+char    *optarg;               /* argument associated with option */
+#endif
+
+#define PRINT_ERROR    ((opterr) && (*options != ':'))
+
+#define FLAG_PERMUTE   0x01    /* permute non-options to the end of argv */
+#define FLAG_ALLARGS   0x02    /* treat non-options as args to option "-1" */
+#define FLAG_LONGONLY  0x04    /* operate as getopt_long_only */
+
+/* return values */
+#define        BADCH           (int)'?'
+#define        BADARG          ((*options == ':') ? (int)':' : (int)'?')
+#define        INORDER         (int)1
+
+#ifndef __CYGWIN__
+#define __progname __argv[0]
+#else
+extern char __declspec(dllimport) *__progname;
+#endif
+
+#ifdef __CYGWIN__
+static char EMSG[] = "";
+#else
+#define        EMSG            ""
+#endif
+
+static int getopt_internal(int, char * const *, const char *,
+                          const struct option *, int *, int);
+static int parse_long_options(char * const *, const char *,
+                             const struct option *, int *, int);
+static int gcd(int, int);
+static void permute_args(int, int, int, char * const *);
+
+static char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1;   /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+static void
+_vwarnx(const char *fmt,va_list ap)
+{
+  (void)fprintf(stderr,"%s: ",__progname);
+  if (fmt != NULL)
+    (void)vfprintf(stderr,fmt,ap);
+  (void)fprintf(stderr,"\n");
+}
+
+static void
+warnx(const char *fmt,...)
+{
+  va_list ap;
+  va_start(ap,fmt);
+  _vwarnx(fmt,ap);
+  va_end(ap);
+}
+
+/*
+ * Compute the greatest common divisor of a and b.
+ */
+static int
+gcd(int a, int b)
+{
+       int c;
+
+       c = a % b;
+       while (c != 0) {
+               a = b;
+               b = c;
+               c = a % b;
+       }
+
+       return (b);
+}
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).
+ */
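+/*
+ * Illustrative example (not from the original sources): with
+ * nargv = { "prog", "file1", "file2", "-a", "-b" }, panonopt_start = 1,
+ * panonopt_end = 3 and opt_end = 5, the vector becomes
+ * { "prog", "-a", "-b", "file1", "file2" }.
+ */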
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+       char * const *nargv)
+{
+       int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+       char *swap;
+
+       /*
+        * compute lengths of blocks and number and size of cycles
+        */
+       nnonopts = panonopt_end - panonopt_start;
+       nopts = opt_end - panonopt_end;
+       ncycle = gcd(nnonopts, nopts);
+       cyclelen = (opt_end - panonopt_start) / ncycle;
+
+       for (i = 0; i < ncycle; i++) {
+               cstart = panonopt_end+i;
+               pos = cstart;
+               for (j = 0; j < cyclelen; j++) {
+                       if (pos >= panonopt_end)
+                               pos -= nnonopts;
+                       else
+                               pos += nopts;
+                       swap = nargv[pos];
+                       /* LINTED const cast */
+                       ((char **) nargv)[pos] = nargv[cstart];
+                       /* LINTED const cast */
+                       ((char **)nargv)[cstart] = swap;
+               }
+       }
+}
+
+/*
+ * parse_long_options --
+ *     Parse long options in argc/argv argument vector.
+ * Returns -1 if short_too is set and the option does not match long_options.
+ */
+static int
+parse_long_options(char * const *nargv, const char *options,
+       const struct option *long_options, int *idx, int short_too)
+{
+       char *current_argv, *has_equal;
+       size_t current_argv_len;
+       int i, ambiguous, match;
+
+#define IDENTICAL_INTERPRETATION(_x, _y)                                \
+       (long_options[(_x)].has_arg == long_options[(_y)].has_arg &&    \
+        long_options[(_x)].flag == long_options[(_y)].flag &&          \
+        long_options[(_x)].val == long_options[(_y)].val)
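+       /*
+        * Two long options count as a single interpretation when has_arg,
+        * flag and val all agree, so (hypothetically) alias entries such as
+        * "--color"/"--colour" would not make the abbreviation "--colo"
+        * ambiguous.
+        */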
+
+       current_argv = place;
+       match = -1;
+       ambiguous = 0;
+
+       optind++;
+
+       if ((has_equal = strchr(current_argv, '=')) != NULL) {
+               /* argument found (--option=arg) */
+               current_argv_len = has_equal - current_argv;
+               has_equal++;
+       } else
+               current_argv_len = strlen(current_argv);
+
+       for (i = 0; long_options[i].name; i++) {
+               /* find matching long option */
+               if (strncmp(current_argv, long_options[i].name,
+                   current_argv_len))
+                       continue;
+
+               if (strlen(long_options[i].name) == current_argv_len) {
+                       /* exact match */
+                       match = i;
+                       ambiguous = 0;
+                       break;
+               }
+               /*
+                * If this is a known short option, don't allow
+                * a partial match of a single character.
+                */
+               if (short_too && current_argv_len == 1)
+                       continue;
+
+               if (match == -1)        /* partial match */
+                       match = i;
+               else if (!IDENTICAL_INTERPRETATION(i, match))
+                       ambiguous = 1;
+       }
+       if (ambiguous) {
+               /* ambiguous abbreviation */
+               if (PRINT_ERROR)
+                       warnx(ambig, (int)current_argv_len,
+                            current_argv);
+               optopt = 0;
+               return (BADCH);
+       }
+       if (match != -1) {              /* option found */
+               if (long_options[match].has_arg == no_argument
+                   && has_equal) {
+                       if (PRINT_ERROR)
+                               warnx(noarg, (int)current_argv_len,
+                                    current_argv);
+                       /*
+                        * XXX: GNU sets optopt to val regardless of flag
+                        */
+                       if (long_options[match].flag == NULL)
+                               optopt = long_options[match].val;
+                       else
+                               optopt = 0;
+                       return (BADARG);
+               }
+               if (long_options[match].has_arg == required_argument ||
+                   long_options[match].has_arg == optional_argument) {
+                       if (has_equal)
+                               optarg = has_equal;
+                       else if (long_options[match].has_arg ==
+                           required_argument) {
+                               /*
+                                * optional argument doesn't use next nargv
+                                */
+                               optarg = nargv[optind++];
+                       }
+               }
+               if ((long_options[match].has_arg == required_argument)
+                   && (optarg == NULL)) {
+                       /*
+                        * Missing argument; leading ':' indicates no error
+                        * should be generated.
+                        */
+                       if (PRINT_ERROR)
+                               warnx(recargstring,
+                                   current_argv);
+                       /*
+                        * XXX: GNU sets optopt to val regardless of flag
+                        */
+                       if (long_options[match].flag == NULL)
+                               optopt = long_options[match].val;
+                       else
+                               optopt = 0;
+                       --optind;
+                       return (BADARG);
+               }
+       } else {                        /* unknown option */
+               if (short_too) {
+                       --optind;
+                       return (-1);
+               }
+               if (PRINT_ERROR)
+                       warnx(illoptstring, current_argv);
+               optopt = 0;
+               return (BADCH);
+       }
+       if (idx)
+               *idx = match;
+       if (long_options[match].flag) {
+               *long_options[match].flag = long_options[match].val;
+               return (0);
+       } else
+               return (long_options[match].val);
+#undef IDENTICAL_INTERPRETATION
+}
+
+/*
+ * getopt_internal --
+ *     Parse argc/argv argument vector.  Called by user level routines.
+ */
+static int
+getopt_internal(int nargc, char * const *nargv, const char *options,
+       const struct option *long_options, int *idx, int flags)
+{
+       char *oli;                              /* option letter list index */
+       int optchar, short_too;
+       static int posixly_correct = -1;
+
+       if (options == NULL)
+               return (-1);
+
+       /*
+        * XXX Some GNU programs (like cvs) set optind to 0 instead of
+        * XXX using optreset.  Work around this braindamage.
+        */
+       if (optind == 0)
+               optind = optreset = 1;
+
+       /*
+        * Disable GNU extensions if POSIXLY_CORRECT is set or options
+        * string begins with a '+'.
+        *
+        * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or
+        *                 optreset != 0 for GNU compatibility.
+        */
+       if (posixly_correct == -1 || optreset != 0)
+               posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
+       if (*options == '-')
+               flags |= FLAG_ALLARGS;
+       else if (posixly_correct || *options == '+')
+               flags &= ~FLAG_PERMUTE;
+       if (*options == '+' || *options == '-')
+               options++;
+
+       optarg = NULL;
+       if (optreset)
+               nonopt_start = nonopt_end = -1;
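+// Annex B low-overhead bitstream layout (summarized from the AV1 spec):
+// a temporal unit is a leb128 temporal_unit_size followed by frame units;
+// each frame unit is a leb128 frame_unit_size followed by OBUs; each OBU
+// is a leb128 obu_length followed by obu_length bytes of OBU data.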
+start:
+       if (optreset || !*place) {              /* update scanning pointer */
+               optreset = 0;
+               if (optind >= nargc) {          /* end of argument vector */
+                       place = EMSG;
+                       if (nonopt_end != -1) {
+                               /* do permutation, if we have to */
+                               permute_args(nonopt_start, nonopt_end,
+                                   optind, nargv);
+                               optind -= nonopt_end - nonopt_start;
+                       }
+                       else if (nonopt_start != -1) {
+                               /*
+                                * If we skipped non-options, set optind
+                                * to the first of them.
+                                */
+                               optind = nonopt_start;
+                       }
+                       nonopt_start = nonopt_end = -1;
+                       return (-1);
+               }
+               if (*(place = nargv[optind]) != '-' ||
+                   (place[1] == '\0' && strchr(options, '-') == NULL)) {
+                       place = EMSG;           /* found non-option */
+                       if (flags & FLAG_ALLARGS) {
+                               /*
+                                * GNU extension:
+                                * return non-option as argument to option 1
+                                */
+                               optarg = nargv[optind++];
+                               return (INORDER);
+                       }
+                       if (!(flags & FLAG_PERMUTE)) {
+                               /*
+                                * If no permutation wanted, stop parsing
+                                * at first non-option.
+                                */
+                               return (-1);
+                       }
+                       /* do permutation */
+                       if (nonopt_start == -1)
+                               nonopt_start = optind;
+                       else if (nonopt_end != -1) {
+                               permute_args(nonopt_start, nonopt_end,
+                                   optind, nargv);
+                               nonopt_start = optind -
+                                   (nonopt_end - nonopt_start);
+                               nonopt_end = -1;
+                       }
+                       optind++;
+                       /* process next argument */
+                       goto start;
+               }
+               if (nonopt_start != -1 && nonopt_end == -1)
+                       nonopt_end = optind;
+
+               /*
+                * If we have "-" do nothing, if "--" we are done.
+                */
+               if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
+                       optind++;
+                       place = EMSG;
+                       /*
+                        * We found an option (--), so if we skipped
+                        * non-options, we have to permute.
+                        */
+                       if (nonopt_end != -1) {
+                               permute_args(nonopt_start, nonopt_end,
+                                   optind, nargv);
+                               optind -= nonopt_end - nonopt_start;
+                       }
+                       nonopt_start = nonopt_end = -1;
+                       return (-1);
+               }
+       }
+
+       /*
+        * Check long options if:
+        *  1) we were passed some
+        *  2) the arg is not just "-"
+        *  3) either the arg starts with -- or we are getopt_long_only()
+        */
+       if (long_options != NULL && place != nargv[optind] &&
+           (*place == '-' || (flags & FLAG_LONGONLY))) {
+               short_too = 0;
+               if (*place == '-')
+                       place++;                /* --foo long option */
+               else if (*place != ':' && strchr(options, *place) != NULL)
+                       short_too = 1;          /* could be short option too */
+
+               optchar = parse_long_options(nargv, options, long_options,
+                   idx, short_too);
+               if (optchar != -1) {
+                       place = EMSG;
+                       return (optchar);
+               }
+       }
+
+       if ((optchar = (int)*place++) == (int)':' ||
+           (optchar == (int)'-' && *place != '\0') ||
+           (oli = strchr(options, optchar)) == NULL) {
+               /*
+                * If the user specified "-" and  '-' isn't listed in
+                * options, return -1 (non-option) as per POSIX.
+                * Otherwise, it is an unknown option character (or ':').
+                */
+               if (optchar == (int)'-' && *place == '\0')
+                       return (-1);
+               if (!*place)
+                       ++optind;
+               if (PRINT_ERROR)
+                       warnx(illoptchar, optchar);
+               optopt = optchar;
+               return (BADCH);
+       }
+       if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
+               /* -W long-option */
+               if (*place)                     /* no space */
+                       /* NOTHING */;
+               else if (++optind >= nargc) {   /* no arg */
+                       place = EMSG;
+                       if (PRINT_ERROR)
+                               warnx(recargchar, optchar);
+                       optopt = optchar;
+                       return (BADARG);
+               } else                          /* white space */
+                       place = nargv[optind];
+               optchar = parse_long_options(nargv, options, long_options,
+                   idx, 0);
+               place = EMSG;
+               return (optchar);
+       }
+       if (*++oli != ':') {                    /* doesn't take argument */
+               if (!*place)
+                       ++optind;
+       } else {                                /* takes (optional) argument */
+               optarg = NULL;
+               if (*place)                     /* no white space */
+                       optarg = place;
+               else if (oli[1] != ':') {       /* arg not optional */
+                       if (++optind >= nargc) {        /* no arg */
+                               place = EMSG;
+                               if (PRINT_ERROR)
+                                       warnx(recargchar, optchar);
+                               optopt = optchar;
+                               return (BADARG);
+                       } else
+                               optarg = nargv[optind];
+               }
+               place = EMSG;
+               ++optind;
+       }
+       /* dump back option letter */
+       return (optchar);
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *     Parse argc/argv argument vector.
+ *
+ * [eventually this will replace the BSD getopt]
+ */
+int
+getopt(int nargc, char * const *nargv, const char *options)
+{
+
+       /*
+        * We don't pass FLAG_PERMUTE to getopt_internal() since
+        * the BSD getopt(3) (unlike GNU) has never done this.
+        *
+        * Furthermore, since many privileged programs call getopt()
+        * before dropping privileges it makes sense to keep things
+        * as simple (and bug-free) as possible.
+        */
+       return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
+}
+#endif /* REPLACE_GETOPT */
+
+/*
+ * getopt_long --
+ *     Parse argc/argv argument vector.
+ */
+int
+getopt_long(int nargc, char * const *nargv, const char *options,
+    const struct option *long_options, int *idx)
+{
+
+       return (getopt_internal(nargc, nargv, options, long_options, idx,
+           FLAG_PERMUTE));
+}
+
+/*
+ * getopt_long_only --
+ *     Parse argc/argv argument vector.
+ */
+int
+getopt_long_only(int nargc, char * const *nargv, const char *options,
+    const struct option *long_options, int *idx)
+{
+
+       return (getopt_internal(nargc, nargv, options, long_options, idx,
+           FLAG_PERMUTE|FLAG_LONGONLY));
+}
diff --git a/tools/dav1d.c b/tools/dav1d.c
new file mode 100644 (file)
index 0000000..4b97a9f
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "vcs_version.h"
+#include "cli_config.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+#ifdef HAVE_IO_H
+# include <io.h>
+#endif
+#ifdef _WIN32
+# include <windows.h>
+#endif
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#endif
+
+#include "dav1d/dav1d.h"
+
+#include "input/input.h"
+
+#include "output/output.h"
+
+#include "dav1d_cli_parse.h"
+
+static uint64_t get_time_nanos(void) {
+#ifdef _WIN32
+    LARGE_INTEGER frequency;
+    QueryPerformanceFrequency(&frequency);
+    LARGE_INTEGER t;
+    QueryPerformanceCounter(&t);
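+    // Split the counter into whole seconds and a remainder so the scale to
+    // nanoseconds cannot overflow 64 bits.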
+    uint64_t seconds = t.QuadPart / frequency.QuadPart;
+    uint64_t fractions = t.QuadPart % frequency.QuadPart;
+    return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart;
+#elif defined(HAVE_CLOCK_GETTIME)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+#elif defined(__APPLE__)
+    mach_timebase_info_data_t info;
+    mach_timebase_info(&info);
+    return mach_absolute_time() * info.numer / info.denom;
+#endif
+}
+
+static void sleep_nanos(uint64_t d) {
+#ifdef _WIN32
+    Sleep((unsigned)(d / 1000000));
+#else
+    const struct timespec ts = {
+        .tv_sec = (time_t)(d / 1000000000),
+        .tv_nsec = d % 1000000000,
+    };
+    nanosleep(&ts, NULL);
+#endif
+}
+
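+// Pace output when decoding in realtime mode: sleep until n_out frames'
+// worth of time (nspf nanoseconds per frame) has elapsed since tfirst,
+// keeping up to `cache` frames of headroom, and optionally log the
+// per-frame time deltas.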
+static void synchronize(const int realtime, const unsigned cache,
+                        const unsigned n_out, const uint64_t nspf,
+                        const uint64_t tfirst, uint64_t *const elapsed,
+                        FILE *const frametimes)
+{
+    const uint64_t tcurr = get_time_nanos();
+    const uint64_t last = *elapsed;
+    *elapsed = tcurr - tfirst;
+    if (realtime) {
+        const uint64_t deadline = nspf * n_out;
+        if (*elapsed < deadline) {
+            const uint64_t remaining = deadline - *elapsed;
+            if (remaining > nspf * cache) sleep_nanos(remaining - nspf * cache);
+            *elapsed = deadline;
+        }
+    }
+    if (frametimes) {
+        const uint64_t frametime = *elapsed - last;
+        fprintf(frametimes, "%" PRIu64 "\n", frametime);
+        fflush(frametimes);
+    }
+}
+
+static void print_stats(const int istty, const unsigned n, const unsigned num,
+                        const uint64_t elapsed, const double i_fps)
+{
+    char buf[80], *b = buf, *const end = buf + 80;
+
+    if (istty)
+        *b++ = '\r';
+    if (num == 0xFFFFFFFF)
+        b += snprintf(b, end - b, "Decoded %u frames", n);
+    else
+        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
+                      n, num, 100.0 * n / num);
+    if (i_fps && b < end) {
+        const double d_fps = 1e9 * n / elapsed;
+        const double speed = d_fps / i_fps;
+        b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+                      d_fps, i_fps, speed);
+    }
+    if (!istty)
+        strcpy(b > end - 2 ? end - 2 : b, "\n");
+    fputs(buf, stderr);
+}
+
+int main(const int argc, char *const *const argv) {
+    const int istty = isatty(fileno(stderr));
+    int res = 0;
+    CLISettings cli_settings;
+    Dav1dSettings lib_settings;
+    DemuxerContext *in;
+    MuxerContext *out = NULL;
+    Dav1dPicture p;
+    Dav1dContext *c;
+    Dav1dData data;
+    unsigned n_out = 0, total, fps[2], timebase[2];
+    uint64_t nspf, tfirst, elapsed = 0; // synchronize() reads *elapsed before writing it
+    double i_fps;
+    FILE *frametimes = NULL;
+    const char *version = dav1d_version();
+
+    if (strcmp(version, DAV1D_VERSION)) {
+        fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
+                version, DAV1D_VERSION);
+        return EXIT_FAILURE;
+    }
+
+    parse(argc, argv, &cli_settings, &lib_settings);
+
+    if ((res = input_open(&in, cli_settings.demuxer,
+                          cli_settings.inputfile,
+                          fps, &total, timebase)) < 0)
+    {
+        return EXIT_FAILURE;
+    }
+    for (unsigned i = 0; i <= cli_settings.skip; i++) {
+        if ((res = input_read(in, &data)) < 0) {
+            input_close(in);
+            return EXIT_FAILURE;
+        }
+        if (i < cli_settings.skip) dav1d_data_unref(&data);
+    }
+
+    if (!cli_settings.quiet)
+        fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version());
+
+    // skip packets until a sequence header is found
+    if (cli_settings.skip) {
+        Dav1dSequenceHeader seq;
+        unsigned seq_skip = 0;
+        while (dav1d_parse_sequence_header(&seq, data.data, data.sz)) {
+            if ((res = input_read(in, &data)) < 0) {
+                input_close(in);
+                return EXIT_FAILURE;
+            }
+            seq_skip++;
+        }
+        if (seq_skip && !cli_settings.quiet)
+            fprintf(stderr,
+                    "skipped %u packets due to missing sequence header\n",
+                    seq_skip);
+    }
+
+    //getc(stdin);
+    if (cli_settings.limit != 0 && cli_settings.limit < total)
+        total = cli_settings.limit;
+
+    if ((res = dav1d_open(&c, &lib_settings)))
+        return EXIT_FAILURE;
+
+    if (cli_settings.frametimes)
+        frametimes = fopen(cli_settings.frametimes, "w");
+
+    if (cli_settings.realtime != REALTIME_CUSTOM) {
+        if (fps[0] == 0 || fps[1] == 0) { // avoid division by zero below
+            i_fps = 0;
+            nspf = 0;
+        } else {
+            i_fps = (double)fps[0] / fps[1];
+            nspf = 1000000000ULL * fps[1] / fps[0];
+        }
+    } else {
+        i_fps = cli_settings.realtime_fps;
+        nspf = (uint64_t)(1000000000.0 / cli_settings.realtime_fps);
+    }
+    tfirst = get_time_nanos();
+
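+    // Decode loop: dav1d_send_data() returns DAV1D_ERR(EAGAIN) while the
+    // decoder cannot take more input, and dav1d_get_picture() returns
+    // DAV1D_ERR(EAGAIN) while it needs more input to produce a picture;
+    // both cases simply continue the loop.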
+    do {
+        memset(&p, 0, sizeof(p));
+        if ((res = dav1d_send_data(c, &data)) < 0) {
+            if (res != DAV1D_ERR(EAGAIN)) {
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(DAV1D_ERR(res)));
+                break;
+            }
+        }
+
+        if ((res = dav1d_get_picture(c, &p)) < 0) {
+            if (res != DAV1D_ERR(EAGAIN)) {
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(DAV1D_ERR(res)));
+                break;
+            }
+            res = 0;
+        } else {
+            if (!n_out) {
+                if ((res = output_open(&out, cli_settings.muxer,
+                                       cli_settings.outputfile,
+                                       &p.p, fps)) < 0)
+                {
+                    if (frametimes) fclose(frametimes);
+                    return EXIT_FAILURE;
+                }
+            }
+            if ((res = output_write(out, &p)) < 0)
+                break;
+            n_out++;
+            if (nspf || !cli_settings.quiet) {
+                synchronize(cli_settings.realtime, cli_settings.realtime_cache,
+                            n_out, nspf, tfirst, &elapsed, frametimes);
+            }
+            if (!cli_settings.quiet)
+                print_stats(istty, n_out, total, elapsed, i_fps);
+        }
+
+        if (cli_settings.limit && n_out == cli_settings.limit)
+            break;
+    } while (data.sz > 0 || !input_read(in, &data));
+
+    if (data.sz > 0) dav1d_data_unref(&data);
+
+    // flush
+    if (res == 0) while (!cli_settings.limit || n_out < cli_settings.limit) {
+        if ((res = dav1d_get_picture(c, &p)) < 0) {
+            if (res != DAV1D_ERR(EAGAIN)) {
+                fprintf(stderr, "Error decoding frame: %s\n",
+                        strerror(DAV1D_ERR(res)));
+            } else {
+                res = 0;
+                break;
+            }
+        } else {
+            if (!n_out) {
+                if ((res = output_open(&out, cli_settings.muxer,
+                                       cli_settings.outputfile,
+                                       &p.p, fps)) < 0)
+                {
+                    if (frametimes) fclose(frametimes);
+                    return EXIT_FAILURE;
+                }
+            }
+            if ((res = output_write(out, &p)) < 0)
+                break;
+            n_out++;
+            if (nspf || !cli_settings.quiet) {
+                synchronize(cli_settings.realtime, cli_settings.realtime_cache,
+                            n_out, nspf, tfirst, &elapsed, frametimes);
+            }
+            if (!cli_settings.quiet)
+                print_stats(istty, n_out, total, elapsed, i_fps);
+        }
+    }
+
+    if (frametimes) fclose(frametimes);
+
+    input_close(in);
+    if (out) {
+        if (!cli_settings.quiet && istty)
+            fprintf(stderr, "\n");
+        if (cli_settings.verify)
+            res |= output_verify(out, cli_settings.verify);
+        else
+            output_close(out);
+    } else {
+        fprintf(stderr, "No data decoded\n");
+        res = 1;
+    }
+    dav1d_close(&c);
+
+    return (res == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
new file mode 100644 (file)
index 0000000..f363033
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <getopt.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#include "dav1d_cli_parse.h"
+#include "src/cpu.h"
+
+static const char short_opts[] = "i:o:vql:s:";
+
+enum {
+    ARG_DEMUXER = 256,
+    ARG_MUXER,
+    ARG_FRAME_TIMES,
+    ARG_REALTIME,
+    ARG_REALTIME_CACHE,
+    ARG_FRAME_THREADS,
+    ARG_TILE_THREADS,
+    ARG_VERIFY,
+    ARG_FILM_GRAIN,
+    ARG_OPPOINT,
+    ARG_ALL_LAYERS,
+    ARG_SIZE_LIMIT,
+    ARG_CPU_MASK,
+};
+
+static const struct option long_opts[] = {
+    { "input",          1, NULL, 'i' },
+    { "output",         1, NULL, 'o' },
+    { "quiet",          0, NULL, 'q' },
+    { "demuxer",        1, NULL, ARG_DEMUXER },
+    { "muxer",          1, NULL, ARG_MUXER },
+    { "version",        0, NULL, 'v' },
+    { "frametimes",     1, NULL, ARG_FRAME_TIMES },
+    { "limit",          1, NULL, 'l' },
+    { "skip",           1, NULL, 's' },
+    { "realtime",       2, NULL, ARG_REALTIME },
+    { "realtimecache",  1, NULL, ARG_REALTIME_CACHE },
+    { "framethreads",   1, NULL, ARG_FRAME_THREADS },
+    { "tilethreads",    1, NULL, ARG_TILE_THREADS },
+    { "verify",         1, NULL, ARG_VERIFY },
+    { "filmgrain",      1, NULL, ARG_FILM_GRAIN },
+    { "oppoint",        1, NULL, ARG_OPPOINT },
+    { "alllayers",      1, NULL, ARG_ALL_LAYERS },
+    { "sizelimit",      1, NULL, ARG_SIZE_LIMIT },
+    { "cpumask",        1, NULL, ARG_CPU_MASK },
+    { NULL,             0, NULL, 0 },
+};
+
+#if ARCH_AARCH64 || ARCH_ARM
+#define ALLOWED_CPU_MASKS " or 'neon'"
+#elif ARCH_PPC64LE
+#define ALLOWED_CPU_MASKS " or 'vsx'"
+#elif ARCH_X86
+#define ALLOWED_CPU_MASKS \
+    ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'"
+#else
+#define ALLOWED_CPU_MASKS "not yet implemented for this architecture"
+#endif
+
+static void usage(const char *const app, const char *const reason, ...) {
+    if (reason) {
+        va_list args;
+
+        va_start(args, reason);
+        vfprintf(stderr, reason, args);
+        va_end(args);
+        fprintf(stderr, "\n\n");
+    }
+    fprintf(stderr, "Usage: %s [options]\n\n", app);
+    fprintf(stderr, "Supported options:\n"
+            " --input/-i $file:     input file\n"
+            " --output/-o $file:    output file\n"
+            " --demuxer $name:      force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from extension)\n"
+            " --muxer $name:        force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
+            " --quiet/-q:           disable status messages\n"
+            " --frametimes $file:   dump frame times to file\n"
+            " --limit/-l $num:      stop decoding after $num frames\n"
+            " --skip/-s $num:       skip decoding of the first $num frames\n"
+            " --realtime [$fract]:  limit framerate, optional argument to override input framerate\n"
+            " --realtimecache $num: set the size of the cache in realtime mode (default: 0)\n"
+            " --version/-v:         print version and exit\n"
+            " --framethreads $num:  number of frame threads (default: 1)\n"
+            " --tilethreads $num:   number of tile threads (default: 1)\n"
+            " --filmgrain $num:     enable film grain application (default: 1, except if muxer is md5)\n"
+            " --oppoint $num:       select an operating point of a scalable AV1 bitstream (0 - 31)\n"
+            " --alllayers $num:     output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
+            " --sizelimit $num:     stop decoding if the frame size exceeds the specified limit\n"
+            " --verify $md5:        verify decoded md5. implies --muxer md5, no output\n"
+            " --cpumask $mask:      restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n");
+    exit(1);
+}
+
+static void error(const char *const app, const char *const optarg,
+                  const int option, const char *const shouldbe)
+{
+    char optname[256];
+    int n;
+
+    for (n = 0; long_opts[n].name; n++)
+        if (long_opts[n].val == option)
+            break;
+    assert(long_opts[n].name);
+    if (long_opts[n].val < 256) {
+        sprintf(optname, "-%c/--%s", long_opts[n].val, long_opts[n].name);
+    } else {
+        sprintf(optname, "--%s", long_opts[n].name);
+    }
+
+    usage(app, "Invalid argument \"%s\" for option %s; should be %s",
+          optarg, optname, shouldbe);
+}
+
+static unsigned parse_unsigned(const char *const optarg, const int option,
+                               const char *const app)
+{
+    char *end;
+    const unsigned res = (unsigned) strtoul(optarg, &end, 0);
+    if (*end || end == optarg) error(app, optarg, option, "an integer");
+    return res;
+}
+
+static int parse_optional_fraction(const char *const optarg, const int option,
+                                   const char *const app, double *value)
+{
+    if (optarg == NULL) return 0;
+    char *end;
+    *value = strtod(optarg, &end);
+    if (*end == '/' && end != optarg) {
+        const char *optarg2 = end + 1;
+        *value /= strtod(optarg2, &end);
+        if (*end || end == optarg2) error(app, optarg, option, "a fraction");
+    } else if (*end || end == optarg) {
+        error(app, optarg, option, "a fraction");
+    }
+    return 1;
+}
+
+typedef struct EnumParseTable {
+    const char *str;
+    const int val;
+} EnumParseTable;
+
+#if ARCH_X86
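+// The masks are cumulative: each entry also enables every older instruction
+// set it implies (e.g. 'avx2' includes SSE2, SSSE3 and SSE4.1).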
+enum CpuMask {
+    X86_CPU_MASK_SSE2      = DAV1D_X86_CPU_FLAG_SSE2,
+    X86_CPU_MASK_SSSE3     = DAV1D_X86_CPU_FLAG_SSSE3     | X86_CPU_MASK_SSE2,
+    X86_CPU_MASK_SSE41     = DAV1D_X86_CPU_FLAG_SSE41     | X86_CPU_MASK_SSSE3,
+    X86_CPU_MASK_AVX2      = DAV1D_X86_CPU_FLAG_AVX2      | X86_CPU_MASK_SSE41,
+    X86_CPU_MASK_AVX512ICL = DAV1D_X86_CPU_FLAG_AVX512ICL | X86_CPU_MASK_AVX2,
+};
+#endif
+
+static const EnumParseTable cpu_mask_tbl[] = {
+#if ARCH_AARCH64 || ARCH_ARM
+    { "neon", DAV1D_ARM_CPU_FLAG_NEON },
+#elif ARCH_PPC64LE
+    { "vsx", DAV1D_PPC_CPU_FLAG_VSX },
+#elif ARCH_X86
+    { "sse2",      X86_CPU_MASK_SSE2 },
+    { "ssse3",     X86_CPU_MASK_SSSE3 },
+    { "sse41",     X86_CPU_MASK_SSE41 },
+    { "avx2",      X86_CPU_MASK_AVX2 },
+    { "avx512icl", X86_CPU_MASK_AVX512ICL },
+#endif
+    { 0 },
+};
+
+static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl,
+                           const int option, const char *app)
+{
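+    // Accepts either a symbolic name from tbl or a raw numeric mask, given
+    // as hex ("0x...") or decimal; the error string listing the valid names
+    // is built while scanning the table.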
+    char str[1024];
+
+    strcpy(str, "any of ");
+    for (int n = 0; tbl[n].str; n++) {
+        if (!strcmp(tbl[n].str, optarg))
+            return tbl[n].val;
+
+        if (n) {
+            if (!tbl[n + 1].str)
+                strcat(str, " or ");
+            else
+                strcat(str, ", ");
+        }
+        strcat(str, tbl[n].str);
+    }
+
+    char *end;
+    unsigned res;
+    if (!strncmp(optarg, "0x", 2)) {
+        res = (unsigned) strtoul(&optarg[2], &end, 16);
+    } else {
+        res = (unsigned) strtoul(optarg, &end, 0);
+    }
+
+    if (*end || end == optarg) {
+        strcat(str, ", a hexadecimal (starting with 0x), or an integer");
+        error(app, optarg, option, str);
+    }
+
+    return res;
+}
+
+void parse(const int argc, char *const *const argv,
+           CLISettings *const cli_settings, Dav1dSettings *const lib_settings)
+{
+    int o;
+
+    memset(cli_settings, 0, sizeof(*cli_settings));
+    dav1d_default_settings(lib_settings);
+    int grain_specified = 0;
+
+    while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) {
+        switch (o) {
+        case 'o':
+            cli_settings->outputfile = optarg;
+            break;
+        case 'i':
+            cli_settings->inputfile = optarg;
+            break;
+        case 'q':
+            cli_settings->quiet = 1;
+            break;
+        case 'l':
+            cli_settings->limit = parse_unsigned(optarg, 'l', argv[0]);
+            break;
+        case 's':
+            cli_settings->skip = parse_unsigned(optarg, 's', argv[0]);
+            break;
+        case ARG_DEMUXER:
+            cli_settings->demuxer = optarg;
+            break;
+        case ARG_MUXER:
+            cli_settings->muxer = optarg;
+            break;
+        case ARG_FRAME_TIMES:
+            cli_settings->frametimes = optarg;
+            break;
+        case ARG_REALTIME:
+            // workaround to parse an optional argument of the form `--a b`
+            // (getopt only allows `--a=b`)
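+            // e.g. `--realtime`, `--realtime 24000/1001` and
+            // `--realtime=24000/1001` are all accepted.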
+            if (optarg == NULL && optind < argc && argv[optind] != NULL &&
+                argv[optind][0] != '-')
+            {
+                optarg = argv[optind];
+                optind++;
+            }
+            cli_settings->realtime = 1 + parse_optional_fraction(optarg,
+                ARG_REALTIME, argv[0], &cli_settings->realtime_fps);
+            break;
+        case ARG_REALTIME_CACHE:
+            cli_settings->realtime_cache =
+                parse_unsigned(optarg, ARG_REALTIME_CACHE, argv[0]);
+            break;
+        case ARG_FRAME_THREADS:
+            lib_settings->n_frame_threads =
+                parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
+            break;
+        case ARG_TILE_THREADS:
+            lib_settings->n_tile_threads =
+                parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
+            break;
+        case ARG_VERIFY:
+            cli_settings->verify = optarg;
+            break;
+        case ARG_FILM_GRAIN:
+            lib_settings->apply_grain =
+                !!parse_unsigned(optarg, ARG_FILM_GRAIN, argv[0]);
+            grain_specified = 1;
+            break;
+        case ARG_OPPOINT:
+            lib_settings->operating_point =
+                parse_unsigned(optarg, ARG_OPPOINT, argv[0]);
+            break;
+        case ARG_ALL_LAYERS:
+            lib_settings->all_layers =
+                !!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]);
+            break;
+        case ARG_SIZE_LIMIT: {
+            char *arg = optarg, *end;
+            uint64_t res = strtoul(arg, &end, 0);
+            if (*end == 'x') // NxM
+                res *= strtoul((arg = end + 1), &end, 0);
+            if (*end || end == arg || res >= UINT_MAX)
+                error(argv[0], optarg, ARG_SIZE_LIMIT, "an integer or dimension");
+            lib_settings->frame_size_limit = (unsigned) res;
+            break;
+        }
+        case 'v':
+            fprintf(stderr, "%s\n", dav1d_version());
+            exit(0);
+        case ARG_CPU_MASK:
+            dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
+                                                ARG_CPU_MASK, argv[0]));
+            break;
+        default:
+            usage(argv[0], NULL);
+        }
+    }
+
+    if (optind < argc)
+        usage(argv[0], "Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
+    if (cli_settings->verify) {
+        if (cli_settings->outputfile)
+            usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set");
+        if (cli_settings->muxer && !strcmp(cli_settings->muxer, "md5"))
+            usage(argv[0], "Verification (--verify) requires the md5 muxer (--muxer md5)");
+
+        cli_settings->outputfile = "-";
+        if (!cli_settings->muxer)
+            cli_settings->muxer = "md5";
+    }
+
+    if (!grain_specified && cli_settings->muxer &&
+        !strcmp(cli_settings->muxer, "md5"))
+    {
+        lib_settings->apply_grain = 0;
+    }
+
+    if (!cli_settings->inputfile)
+        usage(argv[0], "Input file (-i/--input) is required");
+    if ((!cli_settings->muxer || strcmp(cli_settings->muxer, "null")) &&
+        !cli_settings->outputfile)
+    {
+        usage(argv[0], "Output file (-o/--output) is required");
+    }
+}
diff --git a/tools/dav1d_cli_parse.h b/tools/dav1d_cli_parse.h
new file mode 100644 (file)
index 0000000..11e88e1
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_CLI_PARSE_H
+#define DAV1D_CLI_PARSE_H
+
+#include "dav1d/dav1d.h"
+
+typedef struct {
+    const char *outputfile;
+    const char *inputfile;
+    const char *demuxer;
+    const char *muxer;
+    const char *frametimes;
+    const char *verify;
+    unsigned limit, skip;
+    int quiet;
+    enum {
+        REALTIME_DISABLE = 0,
+        REALTIME_INPUT,
+        REALTIME_CUSTOM,
+    } realtime;
+    double realtime_fps;
+    unsigned realtime_cache;
+} CLISettings;
+
+void parse(const int argc, char *const *const argv,
+           CLISettings *const cli_settings, Dav1dSettings *const lib_settings);
+
+#endif /* DAV1D_CLI_PARSE_H */
diff --git a/tools/input/annexb.c b/tools/input/annexb.c
new file mode 100644 (file)
index 0000000..032480d
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2019, James Almer <jamrial@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "dav1d/headers.h"
+
+#include "input/demuxer.h"
+#include "input/parse.h"
+
+// these functions are based on an implementation from FFmpeg, and relicensed
+// with the author's permission
+
+#define PROBE_SIZE 1024
+
+static int annexb_probe(const uint8_t *data) {
+    int ret, cnt = 0;
+
+    size_t temporal_unit_size;
+    ret = leb(data + cnt, PROBE_SIZE - cnt, &temporal_unit_size);
+    if (ret < 0)
+        return 0;
+    cnt += ret;
+
+    size_t frame_unit_size;
+    ret = leb(data + cnt, PROBE_SIZE - cnt, &frame_unit_size);
+    if (ret < 0 || ((uint64_t)frame_unit_size + ret) > temporal_unit_size)
+        return 0;
+    cnt += ret;
+
+    temporal_unit_size -= ret;
+
+    size_t obu_unit_size;
+    ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
+    if (ret < 0 || ((uint64_t)obu_unit_size + ret) >= frame_unit_size)
+        return 0;
+    cnt += ret;
+
+    temporal_unit_size -= obu_unit_size + ret;
+    frame_unit_size -= obu_unit_size + ret;
+
+    // Check that the first OBU is a Temporal Delimiter.
+    size_t obu_size;
+    enum Dav1dObuType type;
+    ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
+                           &obu_size, &type, 1);
+    if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
+        return 0;
+    cnt += (int)obu_unit_size;
+
+    // look for first frame and accompanying sequence header
+    int seq = 0;
+    while (cnt < PROBE_SIZE) {
+        ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
+        if (ret < 0 || ((uint64_t)obu_unit_size + ret) > frame_unit_size)
+            return 0;
+        cnt += ret;
+        temporal_unit_size -= ret;
+        frame_unit_size -= ret;
+
+        ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
+                               &obu_size, &type, 1);
+        if (ret < 0)
+            return 0;
+        cnt += (int)obu_unit_size;
+
+        switch (type) {
+        case DAV1D_OBU_SEQ_HDR:
+            seq = 1;
+            break;
+        case DAV1D_OBU_FRAME:
+        case DAV1D_OBU_FRAME_HDR:
+            return seq;
+        case DAV1D_OBU_TD:
+        case DAV1D_OBU_TILE_GRP:
+            return 0;
+        default:
+            break;
+        }
+
+        temporal_unit_size -= obu_unit_size;
+        frame_unit_size -= obu_unit_size;
+        if (!frame_unit_size)
+            break;
+    }
+
+    return 0;
+}
+
+typedef struct DemuxerPriv {
+    FILE *f;
+    size_t temporal_unit_size;
+    size_t frame_unit_size;
+} AnnexbInputContext;
+
+static int annexb_open(AnnexbInputContext *const c, const char *const file,
+                       unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
+{
+    int res;
+    size_t len;
+
+    if (!(c->f = fopen(file, "rb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    }
+
+    // TODO: Parse sequence header and read timing info if any.
+    fps[0] = 25;
+    fps[1] = 1;
+    timebase[0] = 25;
+    timebase[1] = 1;
+    for (*num_frames = 0;; (*num_frames)++) {
+        res = leb128(c->f, &len);
+        if (res < 0)
+            break;
+        fseeko(c->f, len, SEEK_CUR);
+    }
+    fseeko(c->f, 0, SEEK_SET);
+
+    return 0;
+}
+
+static int annexb_read(AnnexbInputContext *const c, Dav1dData *const data) {
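+    // Bytes remaining in the current temporal and frame units are tracked
+    // across calls so each leb128-prefixed OBU length can be validated
+    // against its enclosing units.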
+    size_t len;
+    int res;
+
+    if (!c->temporal_unit_size) {
+        res = leb128(c->f, &c->temporal_unit_size);
+        if (res < 0) return -1;
+    }
+    if (!c->frame_unit_size) {
+        res = leb128(c->f, &c->frame_unit_size);
+        if (res < 0 || (c->frame_unit_size + res) > c->temporal_unit_size) return -1;
+        c->temporal_unit_size -= res;
+    }
+    res = leb128(c->f, &len);
+    if (res < 0 || (len + res) > c->frame_unit_size) return -1;
+    uint8_t *ptr = dav1d_data_create(data, len);
+    if (!ptr) return -1;
+    c->temporal_unit_size -= len + res;
+    c->frame_unit_size -= len + res;
+    if (fread(ptr, len, 1, c->f) != 1) {
+        fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
+        dav1d_data_unref(data);
+        return -1;
+    }
+
+    return 0;
+}
+
+static void annexb_close(AnnexbInputContext *const c) {
+    fclose(c->f);
+}
+
+const Demuxer annexb_demuxer = {
+    .priv_data_size = sizeof(AnnexbInputContext),
+    .name = "annexb",
+    .probe = annexb_probe,
+    .probe_sz = PROBE_SIZE,
+    .open = annexb_open,
+    .read = annexb_read,
+    .close = annexb_close,
+};
diff --git a/tools/input/demuxer.h b/tools/input/demuxer.h
new file mode 100644 (file)
index 0000000..c2b88e1
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_INPUT_DEMUXER_H
+#define DAV1D_INPUT_DEMUXER_H
+
+#include "data.h"
+
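+/*
+ * Demuxer vtable: probe() inspects a buffer of up to probe_sz bytes from
+ * the start of the file and returns non-zero on a match; open(), read()
+ * and close() then operate on a private context of priv_data_size bytes.
+ */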
+typedef struct DemuxerPriv DemuxerPriv;
+typedef struct Demuxer {
+    int priv_data_size;
+    const char *name;
+    int probe_sz;
+    int (*probe)(const uint8_t *data);
+    int (*open)(DemuxerPriv *ctx, const char *filename,
+                unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
+    int (*read)(DemuxerPriv *ctx, Dav1dData *data);
+    void (*close)(DemuxerPriv *ctx);
+} Demuxer;
+
+#endif /* DAV1D_INPUT_DEMUXER_H */
diff --git a/tools/input/input.c b/tools/input/input.c
new file mode 100644 (file)
index 0000000..3ed6983
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "input/input.h"
+#include "input/demuxer.h"
+
+struct DemuxerContext {
+    DemuxerPriv *data;
+    const Demuxer *impl;
+};
+
+extern const Demuxer ivf_demuxer;
+extern const Demuxer annexb_demuxer;
+extern const Demuxer section5_demuxer;
+static const Demuxer *const demuxers[] = {
+    &ivf_demuxer,
+    &annexb_demuxer,
+    &section5_demuxer,
+    NULL
+};
+
+int input_open(DemuxerContext **const c_out,
+               const char *const name, const char *const filename,
+               unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
+{
+    const Demuxer *impl;
+    DemuxerContext *c;
+    int res, i;
+
+    if (name) {
+        for (i = 0; demuxers[i]; i++) {
+            if (!strcmp(demuxers[i]->name, name)) {
+                impl = demuxers[i];
+                break;
+            }
+        }
+        if (!demuxers[i]) {
+            fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name);
+            return DAV1D_ERR(ENOPROTOOPT);
+        }
+    } else {
+        int probe_sz = 0;
+        for (i = 0; demuxers[i]; i++)
+            probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
+        /* zero-fill so a short read can't leave probe bytes uninitialized */
+        uint8_t *const probe_data = calloc(1, probe_sz);
+        if (!probe_data) {
+            fprintf(stderr, "Failed to allocate memory\n");
+            return DAV1D_ERR(ENOMEM);
+        }
+        FILE *f = fopen(filename, "rb");
+        if (!f) {
+            fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno));
+            free(probe_data);
+            return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
+        }
+        res = !!fread(probe_data, 1, probe_sz, f);
+        fclose(f);
+        if (!res) {
+            free(probe_data);
+            fprintf(stderr, "Failed to read probe data\n");
+            return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
+        }
+
+        for (i = 0; demuxers[i]; i++) {
+            if (demuxers[i]->probe(probe_data)) {
+                impl = demuxers[i];
+                break;
+            }
+        }
+        free(probe_data);
+        if (!demuxers[i]) {
+            fprintf(stderr,
+                    "Failed to probe demuxer for file %s\n",
+                    filename);
+            return DAV1D_ERR(ENOPROTOOPT);
+        }
+    }
+
+    if (!(c = calloc(1, sizeof(DemuxerContext) + impl->priv_data_size))) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        return DAV1D_ERR(ENOMEM);
+    }
+    c->impl = impl;
+    c->data = (DemuxerPriv *) &c[1];
+    if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
+        free(c);
+        return res;
+    }
+    *c_out = c;
+
+    return 0;
+}
+
+int input_read(DemuxerContext *const ctx, Dav1dData *const data) {
+    return ctx->impl->read(ctx->data, data);
+}
+
+void input_close(DemuxerContext *const ctx) {
+    ctx->impl->close(ctx->data);
+    free(ctx);
+}
diff --git a/tools/input/input.h b/tools/input/input.h
new file mode 100644 (file)
index 0000000..7b2fdc9
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_INPUT_INPUT_H
+#define DAV1D_INPUT_INPUT_H
+
+#include "data.h"
+
+typedef struct DemuxerContext DemuxerContext;
+
+int input_open(DemuxerContext **const c_out,
+               const char *const name, const char *const filename,
+               unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
+int input_read(DemuxerContext *ctx, Dav1dData *data);
+void input_close(DemuxerContext *ctx);
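+
+/* Usage sketch; everything except the input_* calls and Dav1dData is
+ * hypothetical caller code:
+ *
+ *     DemuxerContext *in;
+ *     unsigned fps[2], timebase[2], num_frames;
+ *     Dav1dData data;
+ *     if (!input_open(&in, NULL, "file.ivf", fps, &num_frames, timebase)) {
+ *         while (!input_read(in, &data))
+ *             consume(&data); // e.g. dav1d_send_data(), then dav1d_data_unref()
+ *         input_close(in);
+ *     }
+ */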
+
+#endif /* DAV1D_INPUT_INPUT_H */
diff --git a/tools/input/ivf.c b/tools/input/ivf.c
new file mode 100644 (file)
index 0000000..7b572ee
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "input/demuxer.h"
+
+typedef struct DemuxerPriv {
+    FILE *f;
+} IvfInputContext;
+
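+/* The first 12 bytes of the 32-byte IVF file header: "DKIF" magic,
+ * version 0, header length 32 (0x20, little-endian) and the AV01 codec
+ * FOURCC. ivf_open() below additionally reads the time base at bytes
+ * 16-23 and the frame count at bytes 24-27. */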
+static const uint8_t probe_data[] = {
+    'D', 'K', 'I', 'F',
+    0, 0, 0x20, 0,
+    'A', 'V', '0', '1',
+};
+
+static int ivf_probe(const uint8_t *const data) {
+    return !memcmp(data, probe_data, sizeof(probe_data));
+}
+
+static unsigned rl32(const uint8_t *const p) {
+    return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0];
+}
+
+static int64_t rl64(const uint8_t *const p) {
+    return (((uint64_t) rl32(&p[4])) << 32) | rl32(p);
+}
+
+static int ivf_open(IvfInputContext *const c, const char *const file,
+                    unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
+{
+    size_t res;
+    uint8_t hdr[32];
+
+    if (!(c->f = fopen(file, "rb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    } else if ((res = fread(hdr, 32, 1, c->f)) != 1) {
+        fprintf(stderr, "Failed to read stream header: %s\n", strerror(errno));
+        fclose(c->f);
+        return -1;
+    } else if (memcmp(hdr, "DKIF", 4)) {
+        fprintf(stderr, "%s is not an IVF file [tag=%.4s|0x%02x%02x%02x%02x]\n",
+                file, hdr, hdr[0], hdr[1], hdr[2], hdr[3]);
+        fclose(c->f);
+        return -1;
+    } else if (memcmp(&hdr[8], "AV01", 4)) {
+        fprintf(stderr, "%s is not an AV1 file [tag=%.4s|0x%02x%02x%02x%02x]\n",
+                file, &hdr[8], hdr[8], hdr[9], hdr[10], hdr[11]);
+        fclose(c->f);
+        return -1;
+    }
+
+    timebase[0] = rl32(&hdr[16]);
+    timebase[1] = rl32(&hdr[20]);
+    const unsigned duration = rl32(&hdr[24]);
+
+    uint8_t data[4];
+    for (*num_frames = 0;; (*num_frames)++) {
+        if ((res = fread(data, 4, 1, c->f)) != 1)
+            break; // EOF
+        fseeko(c->f, rl32(data) + 8, SEEK_CUR);
+    }
+
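+    // Nominal frame rate: (time base) * (counted frames) / (header duration).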
+    uint64_t fps_num = (uint64_t) timebase[0] * *num_frames;
+    uint64_t fps_den = (uint64_t) timebase[1] * duration;
+    if (fps_num && fps_den) { /* Reduce fraction */
+        uint64_t gcd = fps_num;
+        for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b);
+        fps_num /= gcd;
+        fps_den /= gcd;
+
+        while ((fps_num | fps_den) > UINT_MAX) {
+            fps_num >>= 1;
+            fps_den >>= 1;
+        }
+    }
+    if (fps_num && fps_den) {
+        fps[0] = (unsigned) fps_num;
+        fps[1] = (unsigned) fps_den;
+    } else {
+        fps[0] = fps[1] = 0;
+    }
+
+    fseeko(c->f, 32, SEEK_SET);
+
+    return 0;
+}
+
+static int ivf_read(IvfInputContext *const c, Dav1dData *const buf) {
+    uint8_t data[8];
+    uint8_t *ptr;
+    size_t res;
+
+    const int64_t off = ftello(c->f);
+    if ((res = fread(data, 4, 1, c->f)) != 1)
+        return -1; // EOF
+    const ptrdiff_t sz = rl32(data);
+    if ((res = fread(data, 8, 1, c->f)) != 1)
+        return -1; // EOF
+    ptr = dav1d_data_create(buf, sz);
+    if (!ptr) return -1;
+    buf->m.offset = off;
+    buf->m.timestamp = rl64(data);
+    if ((res = fread(ptr, sz, 1, c->f)) != 1) {
+        fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
+        dav1d_data_unref(buf);
+        return -1;
+    }
+
+    return 0;
+}
+
+static void ivf_close(IvfInputContext *const c) {
+    fclose(c->f);
+}
+
+const Demuxer ivf_demuxer = {
+    .priv_data_size = sizeof(IvfInputContext),
+    .name = "ivf",
+    .probe = ivf_probe,
+    .probe_sz = sizeof(probe_data),
+    .open = ivf_open,
+    .read = ivf_read,
+    .close = ivf_close,
+};
diff --git a/tools/input/parse.h b/tools/input/parse.h
new file mode 100644 (file)
index 0000000..bebea21
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2019, James Almer <jamrial@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_INPUT_PARSE_H
+#define DAV1D_INPUT_PARSE_H
+
+#include "dav1d/headers.h"
+
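+// Reads an AV1 leb128-coded value from f: 7 payload bits per byte, least
+// significant group first, with the top bit of each byte set while more
+// bytes follow. The encoding is capped at 8 bytes and the value at 32 bits,
+// as the spec requires for OBU sizes. Returns the number of bytes consumed,
+// or -1 on error.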
+static int leb128(FILE *const f, size_t *const len) {
+    unsigned i = 0, more;
+    *len = 0;
+    do {
+        uint8_t byte;
+        if (fread(&byte, 1, 1, f) < 1)
+            return -1;
+        more = byte & 0x80;
+        const unsigned bits = byte & 0x7f;
+        if (i <= 3 || (i == 4 && bits < (1 << 4)))
+            *len |= bits << (i * 7);
+        else if (bits) return -1;
+        if (++i == 8 && more) return -1;
+    } while (more);
+    return i;
+}
+
+// These functions are based on an implementation from FFmpeg, relicensed
+// with the author's permission.
+
+static int leb(const uint8_t *ptr, int sz, size_t *const len) {
+    unsigned i = 0, more;
+    *len = 0;
+    do {
+        if (!sz--) return -1;
+        const int byte = *ptr++;
+        more = byte & 0x80;
+        const unsigned bits = byte & 0x7f;
+        if (i <= 3 || (i == 4 && bits < (1 << 4)))
+            *len |= bits << (i * 7);
+        else if (bits) return -1;
+        if (++i == 8 && more) return -1;
+    } while (more);
+    return i;
+}
+
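+// Parses an OBU header (AV1 spec 5.3.2): bit 7 obu_forbidden_bit (must be
+// 0), bits 6-3 obu_type, bit 2 obu_extension_flag, bit 1 obu_has_size_field,
+// bit 0 obu_reserved_1bit. An extension byte (temporal/spatial id), if
+// present, is skipped. Returns the total size of the OBU including headers,
+// or -1 on error.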
+static inline int parse_obu_header(const uint8_t *buf, int buf_size,
+                                   size_t *const obu_size,
+                                   enum Dav1dObuType *const type,
+                                   const int allow_implicit_size)
+{
+    int ret, extension_flag, has_size_flag;
+
+    if (!buf_size)
+        return -1;
+    if (*buf & 0x80) // obu_forbidden_bit
+        return -1;
+
+    *type = (*buf & 0x78) >> 3;
+    extension_flag = (*buf & 0x4) >> 2;
+    has_size_flag  = (*buf & 0x2) >> 1;
+    // ignore obu_reserved_1bit
+    buf++;
+    buf_size--;
+
+    if (extension_flag) {
+        if (!buf_size)
+            return -1;
+        buf++;
+        buf_size--;
+        // ignore temporal_id/spatial_id fields
+    }
+
+    if (has_size_flag) {
+        ret = leb(buf, buf_size, obu_size);
+        if (ret < 0)
+            return -1;
+        return (int) *obu_size + ret + 1 + extension_flag;
+    } else if (!allow_implicit_size)
+        return -1;
+
+    *obu_size = buf_size;
+    return buf_size + 1 + extension_flag;
+}
+
+#endif /* DAV1D_INPUT_PARSE_H */
diff --git a/tools/input/section5.c b/tools/input/section5.c
new file mode 100644 (file)
index 0000000..0c2ce28
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * Copyright © 2019, James Almer <jamrial@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "dav1d/headers.h"
+
+#include "input/demuxer.h"
+#include "input/parse.h"
+
+#define PROBE_SIZE 1024
+
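+// "Section 5" refers to the low-overhead bitstream format of section 5 of
+// the AV1 specification: a raw sequence of OBUs, each carrying its own size
+// field, with every temporal unit introduced by a temporal delimiter OBU.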
+static int section5_probe(const uint8_t *data) {
+    int ret, cnt = 0;
+
+    // Check that the first OBU is a Temporal Delimiter.
+    size_t obu_size;
+    enum Dav1dObuType type;
+    ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt,
+                           &obu_size, &type, 0);
+    if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
+        return 0;
+    cnt += ret;
+
+    // look for first frame and accompanying sequence header
+    int seq = 0;
+    while (cnt < PROBE_SIZE) {
+        ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt,
+                               &obu_size, &type, 0);
+        if (ret < 0)
+            return 0;
+        cnt += ret;
+
+        switch (type) {
+        case DAV1D_OBU_SEQ_HDR:
+            seq = 1;
+            break;
+        case DAV1D_OBU_FRAME:
+        case DAV1D_OBU_FRAME_HDR:
+            return seq;
+        case DAV1D_OBU_TD:
+        case DAV1D_OBU_TILE_GRP:
+            return 0;
+        default:
+            break;
+        }
+    }
+
+    return 0;
+}
+
+typedef struct DemuxerPriv {
+    FILE *f;
+} Section5InputContext;
+
+static int section5_open(Section5InputContext *const c, const char *const file,
+                         unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
+{
+    if (!(c->f = fopen(file, "rb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    }
+
+    // TODO: Parse sequence header and read timing info if any.
+    fps[0] = 25;
+    fps[1] = 1;
+    timebase[0] = 25;
+    timebase[1] = 1;
+    *num_frames = 0;
+    for (;;) {
+        uint8_t byte[2];
+
+        if (fread(&byte[0], 1, 1, c->f) < 1)
+            break;
+        const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
+        if (obu_type == DAV1D_OBU_TD)
+            (*num_frames)++;
+        const int has_length_field = byte[0] & 0x2;
+        if (!has_length_field)
+            return -1;
+        const int has_extension = byte[0] & 0x4;
+        if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
+            return -1;
+        size_t len;
+        const int res = leb128(c->f, &len);
+        if (res < 0)
+            return -1;
+        fseeko(c->f, len, SEEK_CUR); // skip packet
+    }
+    fseeko(c->f, 0, SEEK_SET);
+
+    return 0;
+}
+
+static int section5_read(Section5InputContext *const c, Dav1dData *const data) {
+    size_t total_bytes = 0;
+
+    for (int first = 1;; first = 0) {
+        uint8_t byte[2];
+
+        if (fread(&byte[0], 1, 1, c->f) < 1) {
+            if (!first && feof(c->f)) break;
+            return -1;
+        }
+        const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
+        if (first) {
+            if (obu_type != DAV1D_OBU_TD)
+                return -1;
+        } else {
+            if (obu_type == DAV1D_OBU_TD) {
+                // include TD in next packet
+                fseeko(c->f, -1, SEEK_CUR);
+                break;
+            }
+        }
+        const int has_length_field = byte[0] & 0x2;
+        if (!has_length_field)
+            return -1;
+        const int has_extension = !!(byte[0] & 0x4);
+        if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
+            return -1;
+        size_t len;
+        const int res = leb128(c->f, &len);
+        if (res < 0)
+            return -1;
+        total_bytes += 1 + has_extension + res + len;
+        fseeko(c->f, len, SEEK_CUR); // skip packet, we'll read it below
+    }
+
+    fseeko(c->f, -(off_t)total_bytes, SEEK_CUR);
+    uint8_t *ptr = dav1d_data_create(data, total_bytes);
+    if (!ptr) return -1;
+    if (fread(ptr, total_bytes, 1, c->f) != 1) {
+        fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
+        dav1d_data_unref(data);
+        return -1;
+    }
+
+    return 0;
+}
+
+static void section5_close(Section5InputContext *const c) {
+    fclose(c->f);
+}
+
+const Demuxer section5_demuxer = {
+    .priv_data_size = sizeof(Section5InputContext),
+    .name = "section5",
+    .probe = section5_probe,
+    .probe_sz = PROBE_SIZE,
+    .open = section5_open,
+    .read = section5_read,
+    .close = section5_close,
+};
diff --git a/tools/meson.build b/tools/meson.build
new file mode 100644 (file)
index 0000000..4b4217a
--- /dev/null
@@ -0,0 +1,88 @@
+# Copyright © 2018, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Common source files used by tools and examples
+
+dav1d_input_sources = files(
+    'input/input.c',
+    'input/annexb.c',
+    'input/ivf.c',
+    'input/section5.c',
+)
+
+dav1d_output_sources = files(
+    'output/md5.c',
+    'output/null.c',
+    'output/output.c',
+    'output/y4m2.c',
+    'output/yuv.c',
+)
+
+dav1d_input_objs = static_library('dav1d_input',
+    dav1d_input_sources,
+
+    include_directories : dav1d_inc_dirs,
+    install : false,
+    build_by_default : false,
+)
+
+dav1d_output_objs = static_library('dav1d_output',
+    dav1d_output_sources,
+
+    include_directories : dav1d_inc_dirs,
+    install : false,
+    build_by_default : false,
+)
+
+
+# Leave subdir if tools are disabled
+if not get_option('enable_tools')
+    subdir_done()
+endif
+
+
+#
+# Build definition for the dav1d tools
+#
+
+# Configuration data for cli_config.h
+cli_cdata = configuration_data()
+
+cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_cdata)
+
+# dav1d cli tool sources
+dav1d_sources = files(
+    'dav1d.c',
+    'dav1d_cli_parse.c',
+)
+
+dav1d = executable('dav1d',
+    dav1d_sources,
+    rev_target, cli_config_h_target,
+
+    link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
+    include_directories : [dav1d_inc_dirs],
+    dependencies : [getopt_dependency, thread_dependency, rt_dependency],
+    install : true,
+)
diff --git a/tools/output/md5.c b/tools/output/md5.c
new file mode 100644 (file)
index 0000000..6555de8
--- /dev/null
@@ -0,0 +1,317 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "common/intops.h"
+
+#include "output/muxer.h"
+
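+// MD5 per-round left-rotation amounts (RFC 1321): row r holds the four
+// shifts cycled through in round r.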
+static const uint8_t s[][4] = {
+    { 7, 12, 17, 22, },
+    { 5,  9, 14, 20, },
+    { 4, 11, 16, 23, },
+    { 6, 10, 15, 21, },
+};
+
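+// MD5 round constants: k[i] = floor(2^32 * |sin(i + 1)|) (RFC 1321).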
+static const unsigned k[] = {
+    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
+    0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+    0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+    0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+    0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
+    0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+    0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+    0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+    0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+    0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
+    0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
+    0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+    0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+    0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+};
+
+
+#if ENDIANNESS_BIG
+#define NE2LE_32(x) (((x & 0x00ff) << 24) |\
+                     ((x & 0xff00) <<  8) |\
+                     ((x >>  8) & 0xff00) |\
+                     ((x >> 24) & 0x00ff))
+
+#define NE2LE_64(x) (((x & 0x000000ff) << 56) |\
+                     ((x & 0x0000ff00) << 40) |\
+                     ((x & 0x00ff0000) << 24) |\
+                     ((x & 0xff000000) <<  8) |\
+                     ((x >>  8) & 0xff000000) |\
+                     ((x >> 24) & 0x00ff0000) |\
+                     ((x >> 40) & 0x0000ff00) |\
+                     ((x >> 56) & 0x000000ff))
+
+#else
+#define NE2LE_32(x) (x)
+#define NE2LE_64(x) (x)
+#endif
+
+typedef struct MuxerPriv {
+    unsigned abcd[4];
+    uint8_t data[64];
+    uint64_t len;
+    FILE *f;
+#if ENDIANNESS_BIG
+    uint8_t *bswap;
+    int bswap_w;
+#endif
+} MD5Context;
+
+static int md5_open(MD5Context *const md5, const char *const file,
+                    const Dav1dPictureParameters *const p,
+                    const unsigned fps[2])
+{
+    if (!strcmp(file, "-")) {
+        md5->f = stdout;
+    } else if (!(md5->f = fopen(file, "wb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    }
+
+#if ENDIANNESS_BIG
+    md5->bswap = NULL;
+    md5->bswap_w = 0;
+#endif
+
+    md5->abcd[0] = 0x67452301;
+    md5->abcd[1] = 0xefcdab89;
+    md5->abcd[2] = 0x98badcfe;
+    md5->abcd[3] = 0x10325476;
+    md5->len = 0;
+
+    return 0;
+}
+
+static inline unsigned leftrotate(const unsigned x, const unsigned c) {
+    return (x << c) | (x >> (32 - c));
+}
+
+static void md5_body(MD5Context *md5, const uint8_t *const _data) {
+    const uint32_t *data = (uint32_t *) _data;
+
+    unsigned a = md5->abcd[0];
+    unsigned b = md5->abcd[1];
+    unsigned c = md5->abcd[2];
+    unsigned d = md5->abcd[3];
+    unsigned i;
+
+    for (i = 0; i < 64; i++) {
+        unsigned f, g, tmp;
+
+        if (i < 16) {
+            f = (b & c) | (~b & d);
+            g = i;
+        } else if (i < 32) {
+            f = (d & b) | (~d & c);
+            g = (5 * i + 1) & 15;
+        } else if (i < 48) {
+            f = b ^ c ^ d;
+            g = (3 * i + 5) & 15;
+        } else {
+            f = c ^ (b | ~d);
+            g = (7 * i) & 15;
+        }
+
+        tmp = d;
+        d = c;
+        c = b;
+        b += leftrotate(a + f + k[i] + NE2LE_32(data[g]), s[i >> 4][i & 3]);
+        a = tmp;
+    }
+
+    md5->abcd[0] += a;
+    md5->abcd[1] += b;
+    md5->abcd[2] += c;
+    md5->abcd[3] += d;
+}
+
+static void md5_update(MD5Context *const md5, const uint8_t *data, unsigned len) {
+    if (!len) return;
+
+    if (md5->len & 63) {
+        const unsigned tmp = imin(len, 64 - (md5->len & 63));
+
+        memcpy(&md5->data[md5->len & 63], data, tmp);
+        len -= tmp;
+        data += tmp;
+        md5->len += tmp;
+        if (!(md5->len & 63))
+            md5_body(md5, md5->data);
+    }
+
+    while (len >= 64) {
+        memcpy(md5->data, data, 64);
+        md5_body(md5, md5->data);
+        md5->len += 64;
+        data += 64;
+        len -= 64;
+    }
+
+    if (len) {
+        memcpy(md5->data, data, len);
+        md5->len += len;
+    }
+}
+
+static int md5_write(MD5Context *const md5, Dav1dPicture *const p) {
+    const int hbd = p->p.bpc > 8;
+    const int w = p->p.w, h = p->p.h;
+    uint8_t *yptr = p->data[0];
+
+#if ENDIANNESS_BIG
+    if (hbd && (!md5->bswap || md5->bswap_w < p->p.w)) {
+        free(md5->bswap);
+        md5->bswap_w = 0;
+        md5->bswap = malloc(p->p.w << 1);
+        if (!md5->bswap) return -1;
+        md5->bswap_w = p->p.w;
+    }
+#endif
+
+    for (int y = 0; y < h; y++) {
+#if ENDIANNESS_BIG
+        if (hbd) {
+            for (int x = 0; x < w; x++) {
+                md5->bswap[2 * x + 1] = yptr[2 * x];
+                md5->bswap[2 * x]     = yptr[2 * x + 1];
+            }
+            md5_update(md5, md5->bswap, w << hbd);
+        } else
+#endif
+        md5_update(md5, yptr, w << hbd);
+        yptr += p->stride[0];
+    }
+
+    if (p->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+        const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int cw = (w + ss_hor) >> ss_hor;
+        const int ch = (h + ss_ver) >> ss_ver;
+        for (int pl = 1; pl <= 2; pl++) {
+            uint8_t *uvptr = p->data[pl];
+
+            for (int y = 0; y < ch; y++) {
+#if ENDIANNESS_BIG
+                if (hbd) {
+                    for (int x = 0; x < cw; x++){
+                        md5->bswap[2 * x + 1] = uvptr[2 * x];
+                        md5->bswap[2 * x]     = uvptr[2 * x + 1];
+                    }
+                    md5_update(md5, md5->bswap, cw << hbd);
+                } else
+#endif
+                md5_update(md5, uvptr, cw << hbd);
+                uvptr += p->stride[1];
+            }
+        }
+    }
+
+    dav1d_picture_unref(p);
+
+    return 0;
+}
+
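+// MD5 finalization (RFC 1321): append a single 0x80 byte, pad with zeros
+// until the length is congruent to 56 mod 64, then append the message
+// length in bits as a 64-bit little-endian value.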
+static void md5_finish(MD5Context *const md5) {
+    static const uint8_t bit[2] = { 0x80, 0x00 };
+    uint64_t len = NE2LE_64(md5->len << 3);
+
+    md5_update(md5, &bit[0], 1);
+    while ((md5->len & 63) != 56)
+        md5_update(md5, &bit[1], 1);
+    md5_update(md5, (uint8_t *) &len, 8);
+}
+
+static void md5_close(MD5Context *const md5) {
+    md5_finish(md5);
+    for (int i = 0; i < 4; i++)
+        fprintf(md5->f, "%2.2x%2.2x%2.2x%2.2x",
+                md5->abcd[i] & 0xff,
+                (md5->abcd[i] >> 8) & 0xff,
+                (md5->abcd[i] >> 16) & 0xff,
+                md5->abcd[i] >> 24);
+    fprintf(md5->f, "\n");
+
+#if ENDIANNESS_BIG
+    free(md5->bswap);
+    md5->bswap_w = 0;
+#endif
+
+    if (md5->f != stdout)
+        fclose(md5->f);
+}
+
+static int md5_verify(MD5Context *const md5, const char *const md5_str) {
+    md5_finish(md5);
+
+    if (strlen(md5_str) < 32)
+        return -1; /* too short to be an MD5 digest; fail rather than pass */
+
+    const char *p = md5_str;
+    unsigned abcd[4] = { 0 };
+    char t[3] = { 0 };
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            unsigned val;
+            char *ignore;
+            memcpy(t, p, 2);
+            p += 2;
+            val = (unsigned) strtoul(t, &ignore, 16);
+            abcd[i] |= val << (8 * j);
+        }
+    }
+
+#if ENDIANNESS_BIG
+    free(md5->bswap);
+    md5->bswap_w = 0;
+#endif
+
+    return !!memcmp(abcd, md5->abcd, sizeof(abcd));
+}
+
+const Muxer md5_muxer = {
+    .priv_data_size = sizeof(MD5Context),
+    .name = "md5",
+    .extension = "md5",
+    .write_header = md5_open,
+    .write_picture = md5_write,
+    .write_trailer = md5_close,
+    .verify = md5_verify,
+};
diff --git a/tools/output/muxer.h b/tools/output/muxer.h
new file mode 100644 (file)
index 0000000..54b3f6a
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_OUTPUT_MUXER_H
+#define DAV1D_OUTPUT_MUXER_H
+
+#include "picture.h"
+
+typedef struct MuxerPriv MuxerPriv;
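+/* A muxer implementation. output_open() allocates priv_data_size bytes for
+ * the implementation's MuxerPriv; when no muxer is named explicitly, one is
+ * chosen by matching extension against the output filename. write_header is
+ * optional (the null muxer omits it). */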
+typedef struct Muxer {
+    int priv_data_size;
+    const char *name;
+    const char *extension;
+    int (*write_header)(MuxerPriv *ctx, const char *filename,
+                        const Dav1dPictureParameters *p, const unsigned fps[2]);
+    int (*write_picture)(MuxerPriv *ctx, Dav1dPicture *p);
+    void (*write_trailer)(MuxerPriv *ctx);
+    /**
+     * Verifies the muxed data (for example in the md5 muxer). Replaces write_trailer.
+     *
+     * @param  hash_string Muxer specific reference value.
+     *
+     * @return 0 on success.
+     */
+    int (*verify)(MuxerPriv *ctx, const char *hash_string);
+} Muxer;
+
+#endif /* DAV1D_OUTPUT_MUXER_H */
diff --git a/tools/output/null.c b/tools/output/null.c
new file mode 100644 (file)
index 0000000..f8633f3
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "output/muxer.h"
+
+typedef struct MuxerPriv NullOutputContext;
+
+static int null_write(NullOutputContext *const c, Dav1dPicture *const p) {
+    dav1d_picture_unref(p);
+    return 0;
+}
+
+const Muxer null_muxer = {
+    .priv_data_size = 0,
+    .name = "null",
+    .extension = "null",
+    .write_picture = null_write,
+};
diff --git a/tools/output/output.c b/tools/output/output.c
new file mode 100644 (file)
index 0000000..368d079
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "output/output.h"
+#include "output/muxer.h"
+
+struct MuxerContext {
+    MuxerPriv *data;
+    const Muxer *impl;
+};
+
+extern const Muxer null_muxer;
+extern const Muxer md5_muxer;
+extern const Muxer yuv_muxer;
+extern const Muxer y4m2_muxer;
+static const Muxer *muxers[] = {
+    &null_muxer,
+    &md5_muxer,
+    &yuv_muxer,
+    &y4m2_muxer,
+    NULL
+};
+
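+// Returns a pointer to the extension of f (the alphanumeric tail after the
+// last '.'), or NULL if there is none; the dot must be neither the first
+// character nor directly preceded by '/'.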
+static const char *find_extension(const char *const f) {
+    const size_t l = strlen(f);
+
+    if (l == 0) return NULL;
+
+    const char *const end = &f[l - 1], *step = end;
+    while (step >= f && /* don't step past the start of an all-alphanumeric name */
+           ((*step >= 'a' && *step <= 'z') ||
+            (*step >= 'A' && *step <= 'Z') ||
+            (*step >= '0' && *step <= '9')))
+    {
+        step--;
+    }
+
+    return (step < end && step > f && *step == '.' && step[-1] != '/') ?
+           &step[1] : NULL;
+}
+
+int output_open(MuxerContext **const c_out,
+                const char *const name, const char *const filename,
+                const Dav1dPictureParameters *const p, const unsigned fps[2])
+{
+    const Muxer *impl;
+    MuxerContext *c;
+    unsigned i;
+    int res;
+
+    if (name) {
+        for (i = 0; muxers[i]; i++) {
+            if (!strcmp(muxers[i]->name, name)) {
+                impl = muxers[i];
+                break;
+            }
+        }
+        if (!muxers[i]) {
+            fprintf(stderr, "Failed to find muxer named \"%s\"\n", name);
+            return DAV1D_ERR(ENOPROTOOPT);
+        }
+    } else if (!strcmp(filename, "/dev/null")) {
+        impl = muxers[0];
+    } else {
+        const char *const ext = find_extension(filename);
+        if (!ext) {
+            fprintf(stderr, "No extension found for file %s\n", filename);
+            return -1;
+        }
+        for (i = 0; muxers[i]; i++) {
+            if (!strcmp(muxers[i]->extension, ext)) {
+                impl = muxers[i];
+                break;
+            }
+        }
+        if (!muxers[i]) {
+            fprintf(stderr, "Failed to find muxer for extension \"%s\"\n", ext);
+            return DAV1D_ERR(ENOPROTOOPT);
+        }
+    }
+
+    if (!(c = malloc(sizeof(MuxerContext) + impl->priv_data_size))) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        return DAV1D_ERR(ENOMEM);
+    }
+    c->impl = impl;
+    c->data = (MuxerPriv *) &c[1];
+    if (impl->write_header && (res = impl->write_header(c->data, filename, p, fps)) < 0) {
+        free(c);
+        return res;
+    }
+    *c_out = c;
+
+    return 0;
+}
+
+int output_write(MuxerContext *const ctx, Dav1dPicture *const p) {
+    const int res = ctx->impl->write_picture(ctx->data, p);
+    return res < 0 ? res : 0;
+}
+
+void output_close(MuxerContext *const ctx) {
+    if (ctx->impl->write_trailer)
+        ctx->impl->write_trailer(ctx->data);
+    free(ctx);
+}
+
+int output_verify(MuxerContext *const ctx, const char *const md5_str) {
+    const int res = ctx->impl->verify ?
+        ctx->impl->verify(ctx->data, md5_str) : 0;
+    free(ctx);
+    return res;
+}
diff --git a/tools/output/output.h b/tools/output/output.h
new file mode 100644 (file)
index 0000000..6111c86
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_OUTPUT_OUTPUT_H
+#define DAV1D_OUTPUT_OUTPUT_H
+
+#include "picture.h"
+
+typedef struct MuxerContext MuxerContext;
+
+int output_open(MuxerContext **c, const char *name, const char *filename,
+                const Dav1dPictureParameters *p, const unsigned fps[2]);
+int output_write(MuxerContext *ctx, Dav1dPicture *pic);
+void output_close(MuxerContext *ctx);
+/**
+ * Verifies the muxed data (for example in the md5 muxer). Replaces output_close.
+ *
+ * @param  hash_string Muxer specific reference value.
+ *
+ * @return 0 on success.
+ */
+int output_verify(MuxerContext *ctx, const char *hash_string);
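+
+/* Usage sketch; decode_next_picture() and md5_hex are hypothetical
+ * stand-ins for the caller's decode loop and reference hash:
+ *
+ *     MuxerContext *out;
+ *     Dav1dPicture pic;
+ *     unsigned fps[2] = { 25, 1 };
+ *     if (!decode_next_picture(&pic) &&
+ *         !output_open(&out, NULL, "out.y4m", &pic.p, fps)) {
+ *         do output_write(out, &pic); // the muxer unrefs the picture
+ *         while (!decode_next_picture(&pic));
+ *         output_close(out);          // or output_verify(out, md5_hex)
+ *     }
+ */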
+
+#endif /* DAV1D_OUTPUT_OUTPUT_H */
diff --git a/tools/output/y4m2.c b/tools/output/y4m2.c
new file mode 100644 (file)
index 0000000..bcd4034
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "output/muxer.h"
+
+typedef struct MuxerPriv {
+    FILE *f;
+    int first;
+    unsigned fps[2];
+} Y4m2OutputContext;
+
+static int y4m2_open(Y4m2OutputContext *const c, const char *const file,
+                     const Dav1dPictureParameters *p, const unsigned fps[2])
+{
+    if (!strcmp(file, "-")) {
+        c->f = stdout;
+    } else if (!(c->f = fopen(file, "wb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    }
+
+    c->first = 1;
+    c->fps[0] = fps[0];
+    c->fps[1] = fps[1];
+
+    return 0;
+}
+
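+// Writes the YUV4MPEG2 stream header, "YUV4MPEG2 W<w> H<h> F<num>:<den>
+// Ip C<colorspace>"; y4m2_write() then precedes every picture with a
+// "FRAME" line.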
+static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p) {
+    static const char *const ss_names[][3] = {
+        [DAV1D_PIXEL_LAYOUT_I400] = { "mono", "mono10", "mono12" },
+        [DAV1D_PIXEL_LAYOUT_I420] = { NULL,   "420p10", "420p12" },
+        [DAV1D_PIXEL_LAYOUT_I422] = { "422",  "422p10", "422p12" },
+        [DAV1D_PIXEL_LAYOUT_I444] = { "444",  "444p10", "444p12" }
+    };
+
+    static const char *const chr_names_8bpc_i420[] = {
+        [DAV1D_CHR_UNKNOWN] = "420jpeg",
+        [DAV1D_CHR_VERTICAL] = "420mpeg2",
+        [DAV1D_CHR_COLOCATED] = "420"
+    };
+
+    const char *const ss_name =
+        p->p.layout == DAV1D_PIXEL_LAYOUT_I420 && p->p.bpc == 8 ?
+        chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
+        ss_names[p->p.layout][p->seq_hdr->hbd];
+
+    fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
+            p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
+
+    return 0;
+}
+
+static int y4m2_write(Y4m2OutputContext *const c, Dav1dPicture *const p) {
+    if (c->first) {
+        c->first = 0;
+        const int res = write_header(c, p);
+        if (res < 0) return res;
+    }
+    fprintf(c->f, "FRAME\n");
+
+    uint8_t *ptr;
+    const int hbd = p->p.bpc > 8;
+
+    ptr = p->data[0];
+    for (int y = 0; y < p->p.h; y++) {
+        if (fwrite(ptr, p->p.w << hbd, 1, c->f) != 1)
+            goto error;
+        ptr += p->stride[0];
+    }
+
+    if (p->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+        // u/v
+        const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int cw = (p->p.w + ss_hor) >> ss_hor;
+        const int ch = (p->p.h + ss_ver) >> ss_ver;
+        for (int pl = 1; pl <= 2; pl++) {
+            ptr = p->data[pl];
+            for (int y = 0; y < ch; y++) {
+                if (fwrite(ptr, cw << hbd, 1, c->f) != 1)
+                    goto error;
+                ptr += p->stride[1];
+            }
+        }
+    }
+
+    dav1d_picture_unref(p);
+    return 0;
+
+error:
+    dav1d_picture_unref(p);
+    fprintf(stderr, "Failed to write frame data: %s\n", strerror(errno));
+    return -1;
+}
+
+static void y4m2_close(Y4m2OutputContext *const c) {
+    if (c->f != stdout)
+        fclose(c->f);
+}
+
+const Muxer y4m2_muxer = {
+    .priv_data_size = sizeof(Y4m2OutputContext),
+    .name = "yuv4mpeg2",
+    .extension = "y4m",
+    .write_header = y4m2_open,
+    .write_picture = y4m2_write,
+    .write_trailer = y4m2_close,
+};
diff --git a/tools/output/yuv.c b/tools/output/yuv.c
new file mode 100644 (file)
index 0000000..406f284
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "output/muxer.h"
+
+typedef struct MuxerPriv {
+    FILE *f;
+} YuvOutputContext;
+
+static int yuv_open(YuvOutputContext *const c, const char *const file,
+                    const Dav1dPictureParameters *const p,
+                    const unsigned fps[2])
+{
+    if (!strcmp(file, "-")) {
+        c->f = stdout;
+    } else if (!(c->f = fopen(file, "wb"))) {
+        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int yuv_write(YuvOutputContext *const c, Dav1dPicture *const p) {
+    uint8_t *ptr;
+    const int hbd = p->p.bpc > 8;
+
+    ptr = p->data[0];
+    for (int y = 0; y < p->p.h; y++) {
+        if (fwrite(ptr, p->p.w << hbd, 1, c->f) != 1)
+            goto error;
+        ptr += p->stride[0];
+    }
+
+    if (p->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+        // u/v
+        const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int cw = (p->p.w + ss_hor) >> ss_hor;
+        const int ch = (p->p.h + ss_ver) >> ss_ver;
+        for (int pl = 1; pl <= 2; pl++) {
+            ptr = p->data[pl];
+            for (int y = 0; y < ch; y++) {
+                if (fwrite(ptr, cw << hbd, 1, c->f) != 1)
+                    goto error;
+                ptr += p->stride[1];
+            }
+        }
+    }
+
+    dav1d_picture_unref(p);
+    return 0;
+
+error:
+    dav1d_picture_unref(p);
+    fprintf(stderr, "Failed to write frame data: %s\n", strerror(errno));
+    return -1;
+}
+
+static void yuv_close(YuvOutputContext *const c) {
+    if (c->f != stdout)
+        fclose(c->f);
+}
+
+const Muxer yuv_muxer = {
+    .priv_data_size = sizeof(YuvOutputContext),
+    .name = "yuv",
+    .extension = "yuv",
+    .write_header = yuv_open,
+    .write_picture = yuv_write,
+    .write_trailer = yuv_close,
+};